数据合并
- 1. 知识点
- 1.27 左连接
- 1.28 数据填充与交叉连接
- 1.29 获取列值列表
- 2. 题目
- 2.26 合作过至少三次的演员和导演
- 2.27 使用唯一标识码替换员工ID
- 2.28 学生们参加各科测试的次数
- 2.29 至少有5名直接下属的经理
- 2.30 销售员
1. 知识点
1.27 左连接
data=pd.merge(employees,employee_uni,on='id',how='left')
1.28 数据填充与交叉连接
- 交叉合并
df = pd.merge(students,subjects,how='cross')
- 空值填充
result['attended_exams'] = result['attended_exams'].fillna(0)
1.29 获取列值列表
data_max_id=employee.groupby('managerId').agg(counts=('managerId','count')).reset_index().query('`counts`>=5')['managerId'].tolist()
2. 题目
2.26 合作过至少三次的演员和导演
import pandas as pd
def actors_and_directors(actor_director: pd.DataFrame) -> pd.DataFrame:
    """Return the (actor_id, director_id) pairs that occur at least three times.

    Args:
        actor_director: Frame with at least the columns ``actor_id`` and
            ``director_id``; each row is counted as one occurrence of the pair.

    Returns:
        A frame with the columns ``actor_id`` and ``director_id``, one row per
        pair whose row count is 3 or more.
    """
    # One row per distinct pair, with the number of source rows in `counts`.
    pair_counts = (
        actor_director.groupby(['actor_id', 'director_id'])
        .size()
        .reset_index(name='counts')
    )
    # Pairs are already unique after the groupby, so no de-duplication step
    # (and no in-place mutation of a filtered slice) is needed.
    return pair_counts.loc[pair_counts['counts'] >= 3, ['actor_id', 'director_id']]
2.27 使用唯一标识码替换员工ID
import pandas as pd
def replace_employee_id(employees: pd.DataFrame, employee_uni: pd.DataFrame) -> pd.DataFrame:
    """Attach each employee's ``unique_id`` and return it alongside ``name``.

    Args:
        employees: Frame with at least the columns ``id`` and ``name``.
        employee_uni: Frame with at least the columns ``id`` and ``unique_id``.

    Returns:
        A frame with the columns ``unique_id`` and ``name``. Every row of
        ``employees`` is kept (left join on ``id``); employees without a match
        in ``employee_uni`` get a missing ``unique_id``.
    """
    wanted_columns = ['unique_id', 'name']
    joined = employees.merge(employee_uni, how='left', on='id')
    return joined[wanted_columns]
2.28 学生们参加各科测试的次数
import pandas as pd
def students_and_examinations(students: pd.DataFrame, subjects: pd.DataFrame, examinations: pd.DataFrame) -> pd.DataFrame:
    """Count how many times each student attended each subject's exam.

    Args:
        students: Frame with at least the columns ``student_id`` and
            ``student_name``.
        subjects: Frame with at least the column ``subject_name``.
        examinations: Frame with at least the columns ``student_id`` and
            ``subject_name``; each row is counted as one attended exam.

    Returns:
        A frame with the columns ``student_id``, ``student_name``,
        ``subject_name`` and ``attended_exams`` (integer), containing every
        student/subject combination and sorted by ``student_id`` then
        ``subject_name``. Combinations with no exam rows have a count of 0.
    """
    # Cross join: every student paired with every subject, so combinations
    # that never appear in `examinations` are still reported.
    all_pairs = pd.merge(students, subjects, how='cross')
    # Number of exam rows per (student, subject).
    exam_counts = (
        examinations.groupby(['student_id', 'subject_name'])
        .size()
        .reset_index(name='attended_exams')
    )
    # Left join keeps all pairs; pairs without exams get NaN here.
    result = pd.merge(all_pairs, exam_counts, on=['student_id', 'subject_name'], how='left')
    # NaN forces the column to float; fill with 0 and restore an integer count.
    result['attended_exams'] = result['attended_exams'].fillna(0).astype(int)
    result = result.sort_values(by=['student_id', 'subject_name'])
    return result[['student_id', 'student_name', 'subject_name', 'attended_exams']]
2.29 至少有5名直接下属的经理
import pandas as pd
def find_managers(employee: pd.DataFrame) -> pd.DataFrame:
    """Return the names of employees who are the manager of at least five rows.

    Args:
        employee: Frame with at least the columns ``id``, ``name`` and
            ``managerId``; ``managerId`` refers to another row's ``id`` and may
            be missing.

    Returns:
        A single-column frame (``name``) with the employees whose ``id``
        appears as ``managerId`` in five or more rows.
    """
    # Rows per manager id; missing managerId values are not counted.
    report_counts = employee['managerId'].value_counts()
    manager_ids = report_counts[report_counts >= 5].index
    # Match with isin rather than interpolating the id list into a query
    # string, so values never have to round-trip through the expression parser.
    return employee.loc[employee['id'].isin(manager_ids), ['name']]
2.30 销售员
import pandas as pd
def sales_person(sales_person: pd.DataFrame, company: pd.DataFrame, orders: pd.DataFrame) -> pd.DataFrame:
    """Return the names of salespeople with no order for a company named "RED".

    Args:
        sales_person: Frame with at least the columns ``sales_id`` and ``name``.
        company: Frame with at least the columns ``com_id`` and ``name``.
        orders: Frame with at least the columns ``com_id`` and ``sales_id``.

    Returns:
        A single-column frame (``name``) with every salesperson whose
        ``sales_id`` does not appear on any order placed with a company whose
        name is exactly ``"RED"``.
    """
    # Approach: find everyone tied to RED; whoever is left is unrelated to it.
    red_company_ids = company.loc[company['name'] == 'RED', 'com_id']
    red_sales_ids = orders.loc[orders['com_id'].isin(red_company_ids), 'sales_id']
    # isin avoids building a query string from the id list, which would fail on
    # values whose repr is not a valid expression (e.g. a missing sales_id).
    return sales_person.loc[~sales_person['sales_id'].isin(red_sales_ids), ['name']]