【pandas】常用方法积累
不同pd对象基于同一列进行数据对齐
创建数据:
import pandas as pd
import torch
# Example `user` table, written one record per row: (id, name, age).
# The id strings deliberately have different lengths.
_user_rows = [
    ('u1233', 'Alice', 25),
    ('u456', 'Bob', 30),
    ('u78', 'Charlie', 22),
    ('u101', 'David', 35),
    ('u112', 'Eve', 28),
]
# Transpose the records into a column-name -> list-of-values mapping.
user_data = {
    column: list(values)
    for column, values in zip(('id', 'name', 'age'), zip(*_user_rows))
}
user = pd.DataFrame(data=user_data)

# Example `split` table: (id, mask) pairs. The ids are the same as in
# `user` but listed in a different order; `mask` names the subset
# (train / test / val) each user is assigned to.
_split_rows = [
    ('u456', 'train'),
    ('u101', 'test'),
    ('u78', 'val'),
    ('u112', 'train'),
    ('u1233', 'test'),
]
split_data = {
    column: list(values)
    for column, values in zip(('id', 'mask'), zip(*_split_rows))
}
split = pd.DataFrame(data=split_data)
output:
user DataFrame:
id name age
0 u1233 Alice 25
1 u456 Bob 30
2 u78 Charlie 22
3 u101 David 35
4 u112 Eve 28
split DataFrame:
id mask
0 u456 train
1 u101 test
2 u78 val
3 u112 train
4 u1233 test
# 1. Join `user` with the id/mask columns of `split` on the shared 'id'
#    column so that every user row carries its `mask` label. An inner join
#    keeps only ids that appear in both frames.
merged_df = user.merge(split[['id', 'mask']], on='id', how='inner')
merged_df  # bare expression: echoed as output in a notebook/REPL
# 2. Partition the merged frame by its `mask` label using boolean row
#    selection; the selected rows keep their index labels from `merged_df`.
train_df = merged_df.loc[merged_df['mask'].eq('train')]
test_df = merged_df.loc[merged_df['mask'].eq('test')]
val_df = merged_df.loc[merged_df['mask'].eq('val')]
train_df, test_df, val_df  # bare expression: echoed as a tuple of frames
output:
id name age mask
0 u1233 Alice 25 test
1 u456 Bob 30 train
2 u78 Charlie 22 val
3 u101 David 35 test
4 u112 Eve 28 train
( id name age mask
1 u456 Bob 30 train
4 u112 Eve 28 train,
id name age mask
0 u1233 Alice 25 test
3 u101 David 35 test,
id name age mask
2 u78 Charlie 22 val)