# 模块一:处理班级标签(重点:字符串处理,分列合并,长度填充等)
import pandas as pd
import os
import numpy as np
import datetime
# Module 1: derive a schedule tag for every class and attach it to the roster.
#
# Reads the class timetable workbook (file1) and the new-class roster (file2),
# builds "班型时间" per 班级id (concatenated weekday values, left-padded to two
# characters, followed by the concatenated zero-padded hour+minute strings),
# and writes the roster with an additional "班型名称" column.
today = str(datetime.date.today())
filepath = '/Users/kangyongqing/Documents/kangyq/202311/班均及合班储备/最后校验/二批次组班/'
file1 = '02班级时间线上查询2023-12-19.xlsx'
file2 = '05新班级组班明细2023-12-19的副本.xlsx'

df1 = pd.read_excel(filepath + file1, dtype='object')
print(df1.columns)

# Count pivots; they are only consumed by the disabled sheets in the writer
# section at the end of this module.
piv1 = pd.pivot_table(df1, index='shijian', values='班级id', aggfunc='count')
piv2 = pd.pivot_table(df1, index='xingqi', values='班级id', aggfunc='count')
piv3 = pd.pivot_table(df1, index='班级id', values='xingqi', aggfunc='count')

# Split 'shijian' on ':' and left-pad both pieces to two characters.
time_parts = df1['shijian'].str.split(':')
df1.loc[:, '小时'] = time_parts.str[0]
df1.loc[:, '分钟'] = time_parts.str[1]
df1.loc[:, '规则小时'] = df1['小时'].str.zfill(width=2)
df1.loc[:, '规则分钟'] = df1['分钟'].str.zfill(width=2)

# Print the distinct values before/after padding as a visual sanity check.
print(set(df1['小时']))
print(set(df1['规则小时']))
print(set(df1['分钟']))
print(set(df1['规则分钟']))

df1.loc[:, '规则时间'] = df1['规则小时'] + df1['规则分钟']
print(df1.head())

# Sort so that the per-class concatenations below follow weekday order.
df1.sort_values(by=['班级id', 'xingqi'], ascending=True, inplace=True)
print(df1.head())

# One row per class: all weekday values joined, and all padded times joined.
group = df1.groupby('班级id')
xq = pd.DataFrame({
    '星期集合': group['xingqi'].apply(lambda x: ''.join([str(i) for i in x])),
    '时间集合': group['规则时间'].apply(lambda y: ''.join([str(u) for u in y])),
}).reset_index()

df2 = pd.merge(df1, xq, on='班级id', how='left')
df2.loc[:, '规则星期集合'] = df2['星期集合'].str.zfill(width=2)
df2.loc[:, '班型时间'] = df2['规则星期集合'].str.cat(df2['时间集合'], sep='')
print(df2.head())

# Collapse the timetable rows to one (班级id, 班型时间) pair per class.
df3 = df2.loc[:, ['班级id', '班型时间']].drop_duplicates()
print(df3.shape)
print(df3.head())

df4 = pd.read_excel(filepath + file2, dtype='object')
df5 = pd.merge(df4, df3, on='班级id', how='left')

# The first three '-'-separated pieces of 班级名称 become 区域 / 版本 / 进度.
name_parts = df5['班级名称'].str.split('-')
df5.loc[:, '区域'] = name_parts.str[0]
df5.loc[:, '版本'] = name_parts.str[1]
df5.loc[:, '进度'] = name_parts.str[2]
df5.loc[:, '班型名称'] = (
    df5['区域']
    .str.cat(df5['版本'], sep='-')
    .str.cat(df5['进度'], sep='-')
    .str.cat(df5['班型时间'], sep='-')
)
print(df5.columns)
print(df5.shape)

# The context manager saves the workbook and closes the file handle through
# the public API (the original called the private ExcelWriter._save()).
with pd.ExcelWriter(filepath + f'03新班级对应上课时间{today}.xlsx') as writer:
    # Optional diagnostic sheets (disabled):
    # piv1.to_excel(writer,sheet_name='时间透视')
    # piv2.to_excel(writer,sheet_name='星期透视')
    # piv3.to_excel(writer,sheet_name='班级透视')
    df5.to_excel(writer, sheet_name='新班级上课时间', index=False)
# 模块二:字符串处理(重点:多种形式的字符串替换及分列合并)
import pandas as pd
import os
import numpy as np
import datetime
# Module 2: build the "班型名称" label for each transfer student.
#
# Reads the processed transfer list, normalises region / weekday / course /
# progress / time columns with string replacements, concatenates them into
# "班型名称", and writes (学员id, 班型名称) to a new workbook.
today = str(datetime.date.today())
filepath = '/Users/kangyongqing/Documents/kangyq/202311/班均及合班储备/最后校验/二批次组班/'
file1 = '小班课迁移二批次插班名单整合2023-12-20处理.xlsx'

df1 = pd.read_excel(filepath + file1, dtype='object')
print(df1.columns)
print(df1.head())

# Whole-value replacement of region names with one-letter codes.
df1['地区'] = df1['地区'].replace({'亚洲': 'A', '欧洲': 'E', '大洋洲': 'O'})
print(set(df1['地区']))

# Weekday text -> digits: drop the '星期' prefix and the commas, map the
# Chinese numerals to 1-7, then left-pad to two characters.
# regex=False keeps these literal replacements independent of the pandas
# version's default for the `regex` argument.
df1['weekday'] = df1['weekday'].str.replace('星期', '', regex=False)
df1['weekday'] = df1['weekday'].str.replace(',', '', regex=False)
translate_table = str.maketrans('一二三四五六日', '1234567')
df1['weekday'] = df1['weekday'].str.translate(translate_table)
df1['weekday'] = df1['weekday'].str.zfill(width=2)
print(set(df1['weekday']))

# Course string: upper-case, first space-separated token + 'TM' (with any
# 'C2' removed) is the version; the third token is the level.
df1['转入课程'] = df1['转入课程'].str.upper()
course_tokens = df1['转入课程'].str.split(' ')
df1.loc[:, '版本'] = course_tokens.str[0] + 'TM'
df1['版本'] = df1['版本'].str.replace('C2', '', regex=False)
print(set(df1['版本']))
df1.loc[:, 'level'] = course_tokens.str[2]
print(set(df1['level']))
df1.loc[:, '版本level'] = df1['版本'].str.cat(df1['level'], sep='')
print(set(df1['版本level']))

# Progress code followed by the course frequency.
df1['课程进度'] = df1['课程进度'].replace({'正常进度': 'NOR', '非正常进度': 'ABN'})
df1['进度'] = df1['课程进度'].str.cat(df1['课频'].astype(str), sep='')
print(set(df1['进度']))

# Time text with ':' and ',' stripped.
df1.loc[:, '时间'] = df1['time'].str.replace(':', '', regex=False)
df1['时间'] = df1['时间'].str.replace(',', '', regex=False)

# region-versionlevel-progress-weekday immediately followed by the time digits.
df1.loc[:, '班型名称'] = (
    df1['地区']
    .str.cat(df1['版本level'], sep='-')
    .str.cat(df1['进度'], sep='-')
    .str.cat(df1['weekday'], sep='-')
    .str.cat(df1['时间'], sep='')
)
print(df1.head())
print(df1.shape)

df2 = df1.loc[:, ['student_user_id', '班型名称']].rename(
    columns={'student_user_id': '学员id'}
)

# The context manager saves the workbook and closes the file handle through
# the public API (the original called the private ExcelWriter._save()).
with pd.ExcelWriter(filepath + f'04插班生班型名称{today}.xlsx') as writer:
    df2.to_excel(writer, sheet_name='插班学生明细', index=False)
# 模块三:特殊需求处理(重点:lesson1的level下降一级,长字符串的处理)
import pandas as pd
import os
import numpy as np
import datetime
# Module 3: rewrite the "转入课程" string so that rows whose lesson is '1'
# get their level number reduced by one, then save the processed list.
today = str(datetime.date.today())
filepath = '/Users/kangyongqing/Documents/kangyq/202311/班均及合班储备/最后校验/二批次组班/'
file1 = '小班课迁移二批次插班名单整合.xlsx'

df1 = pd.read_excel(filepath + file1, dtype='object')
print(df1.columns)
print(df1.head())

# Third and fourth space-separated tokens, each with its first character dropped.
course_tokens = df1['转入课程'].str.split(' ')
df1.loc[:, 'level'] = course_tokens.str[2].str.slice(1)
df1.loc[:, 'lesson'] = course_tokens.str[3].str.slice(1)

# Level minus one where lesson == '1', otherwise the level unchanged.
level_number = df1['level'].astype('int')
is_first_lesson = df1['lesson'] == '1'
df1.loc[:, '处理level'] = np.where(is_first_lesson, level_number - 1, level_number)
df1.loc[:, '新level'] = 'L' + df1['处理level'].astype(str)

# Everything before the last two space-separated tokens.
without_last_token = df1['转入课程'].str.rpartition(' ')[0]
df1.loc[:, '前部'] = without_last_token.str.rpartition(' ')[0]

# Re-assemble: prefix, adjusted level, original lesson.
lesson_token = 'L' + df1['lesson']
with_level = df1['前部'].str.cat(df1['新level'], sep=' ')
df1.loc[:, '新转入课程'] = with_level.str.cat(lesson_token, sep=' ')

# Preview the columns at positions 5..12.
print(df1.iloc[:, list(range(5, 13))].head())

output_name = f'小班课迁移二批次插班名单整合{today}处理.xlsx'
df1.to_excel(filepath + output_name, index=False)
# 模块四:分批次插班迭代(重点:根据实际数据形态按指定规则插班,并输出结果明细及影响分析)
import pandas as pd
import os
import numpy as np
import datetime
# Module 4: place transfer students into existing classes in up to three rounds.
#
# Classes with 1-3 free seats (capacity 6) are ranked within each 班型名称;
# round k matches the not-yet-placed students against the k-th ranked class of
# the same 班型名称 and keeps at most `可容纳` students per class. The result,
# plus before/after class-size statistics, is written to one workbook.
today = str(datetime.date.today())
filepath = '/Users/kangyongqing/Documents/kangyq/202311/班均及合班储备/最后校验/二批次组班/'
file1 = '03新班级对应上课时间2023-12-20.xlsx'
file2 = '04插班生班型名称2023-12-20.xlsx'

df1 = pd.read_excel(
    filepath + file1,
    usecols=['学员id', '班级id', '班型名称', '班级名称'],
    dtype='object',
)
df1['标签'] = '已创建新班'
df2 = pd.read_excel(filepath + file2, dtype='object')
print(df1.columns, df1.shape)
print(df2.columns, df2.shape)

# Current head count per class and the number of free seats out of 6.
df3 = pd.DataFrame(
    pd.pivot_table(
        df1, index=['班级id', '班型名称'], values='学员id', aggfunc='count'
    ).reset_index()
)
df3.rename(columns={'学员id': '班容'}, inplace=True)
df3.loc[:, '可容纳'] = 6 - df3['班容']
print(df3.head())
print(df3.shape, df3['班容'].sum())

# Only classes with 1 to 3 free seats take part in the placement.
df4 = df3.loc[df3['可容纳'].between(1, 3, inclusive='both'), :].copy()
print(df4.shape, df4['班容'].sum())

# Rank classes inside each 班型名称, smallest head count first (1, 2, 3, ...).
df4.sort_values(by=['班型名称', '班容'], ascending=True, inplace=True)
df4.loc[:, '索引'] = range(len(df4))
df4.loc[:, '班级序号'] = df4.groupby('班型名称')['索引'].rank(method='dense')
print(df4.head(10))

# Number of classes available at each rank.
# NOTE(review): only ranks 1-3 are processed below; classes ranked 4 or
# higher are never offered to students.
pici = df4.groupby('班级序号')['班级id'].count()
print(pici)

df5 = df4.loc[df4['班级序号'] == 1, :].copy()
df52 = df4.loc[df4['班级序号'] == 2, :].copy()
df53 = df4.loc[df4['班级序号'] == 3, :].copy()

# ---- Round 1 ----
df6 = pd.merge(df5, df2, on='班型名称', how='left')
print(df6.columns)
print(df6.shape)
df61 = df6.loc[df6['学员id'].notnull(), :].copy()
print(df61.shape)
# Number the candidate students inside each class and keep only as many as
# the class has free seats.
df61.loc[:, '学生索引'] = range(len(df61))
df61.loc[:, '学生序号'] = df61.groupby('班级id')['学生索引'].rank(method='dense')
df61['插班一'] = '插班一'
df62 = df61[df61['学生序号'].astype('int') <= df61['可容纳'].astype('int')]
print(df62.shape)
print(df62.head())
# ---- Round 1 done ----

# ---- Round 2 ----
df22 = pd.merge(df2, df62.loc[:, ['学员id', '插班一']], on='学员id', how='left')
# Drop the students already placed in round 1.
df222 = df22.loc[df22['插班一'].isnull(), :].copy()
print(df222.shape)
df7 = pd.merge(df52, df222, on='班型名称', how='left')
print(df7.columns)
print(df7.shape)
df71 = df7.loc[df7['学员id'].notnull(), :].copy()
print(df71.shape)
df71.loc[:, '学生索引'] = range(len(df71))
df71.loc[:, '学生序号'] = df71.groupby('班级id')['学生索引'].rank(method='dense')
df71['插班二'] = '插班二'
df72 = df71[df71['学生序号'].astype('int') <= df71['可容纳'].astype('int')]
print(df72.shape)
print(df72.head())
# ---- Round 2 done ----

# ---- Round 3 ----
df23 = pd.merge(df222, df72.loc[:, ['学员id', '插班二']], on='学员id', how='left')
# Drop the students already placed in round 2.
df233 = df23.loc[df23['插班二'].isnull(), :].copy()
print(df233.shape)
df8 = pd.merge(df53, df233, on='班型名称', how='left')
print(df8.columns)
print(df8.shape)
df81 = df8.loc[df8['学员id'].notnull(), :].copy()
print(df81.shape)
df81.loc[:, '学生索引'] = range(len(df81))
df81.loc[:, '学生序号'] = df81.groupby('班级id')['学生索引'].rank(method='dense')
df81['插班三'] = '插班三'
df82 = df81[df81['学生序号'].astype('int') <= df81['可容纳'].astype('int')]
print(df82.shape)
print(df82.head())
# ---- Round 3 done ----

# Students still unplaced after round 3 (input for a fourth round, if any).
df24 = pd.merge(df233, df82.loc[:, ['学员id', '插班三']], on='学员id', how='left')
# Drop the students already placed in round 3.
df244 = df24.loc[df24['插班三'].isnull(), :].copy()
print(df244.shape)
print(df244.head())

# All students placed in rounds 1-3.
placed_columns = ['学员id', '班级id', '班型名称']
df9 = pd.concat(
    (
        df62.loc[:, placed_columns],
        df72.loc[:, placed_columns],
        df82.loc[:, placed_columns],
    ),
    axis=0,
)
print(df9.shape)
df9['班级名称'] = ''
df9['标签'] = '二批次插班'
print(df9.head())

# Original roster plus the newly placed students.
df10 = pd.concat((df1, df9), axis=0)
print(df10.shape)
print(df10.head())

# Student count per label, and class-size distributions before/after placement.
df11 = pd.pivot_table(df10, index='标签', values='学员id', aggfunc='count', margins=True)
df120 = (
    pd.pivot_table(
        df10[df10['标签'] == '已创建新班'],
        index='班级id', values='学员id', aggfunc='count',
    )
    .reset_index()
    .rename(columns={'学员id': '班容'})
)
df130 = pd.pivot_table(df120, index='班容', values='班级id', aggfunc='count', margins=True)
df12 = (
    pd.pivot_table(df10, index='班级id', values='学员id', aggfunc='count')
    .reset_index()
    .rename(columns={'学员id': '班容'})
)
df13 = pd.pivot_table(df12, index='班容', values='班级id', aggfunc='count', margins=True)
# fill_value=0: a class size that only exists after placement had no classes
# before, so its change is the full count rather than NaN.
# NOTE(review): sizes that exist only before placement are not rows of df13
# and therefore still do not appear in this column.
df13.loc[:, '班级数变化'] = df13['班级id'].sub(df130['班级id'], fill_value=0)

# Average students per class before and after placement.
qbj = np.round(len(df1) / len(df1['班级id'].unique()), 2)
print(qbj)
hbj = np.round(len(df10) / len(df10['班级id'].unique()), 2)
print(hbj)

# The context manager saves the workbook and closes the file handle through
# the public API (the original called the private ExcelWriter._save()).
with pd.ExcelWriter(filepath + f'06二批次可插班明细{today}.xlsx', engine='openpyxl') as writer:
    # Optional diagnostic sheets (disabled):
    # df4.to_excel(writer,sheet_name='班级排序')
    # pici.to_excel(writer,sheet_name='批次班级数')
    # df5.to_excel(writer,sheet_name='第一次插班班级明细')
    df10.to_excel(writer, sheet_name='插班后学员明细', index=False)
    df11.to_excel(writer, sheet_name='批次统计')
    df130.to_excel(writer, sheet_name='插班前班容分布')
    df13.to_excel(writer, sheet_name='插班后班容分布')
    # Extra sheet written cell by cell through the openpyxl workbook.
    wb = writer.book.create_sheet('班均')
    wb.cell(row=1, column=1).value = '插班前班均'
    wb.cell(row=1, column=2).value = '插班后班均'
    wb.cell(row=2, column=1).value = qbj
    wb.cell(row=2, column=2).value = hbj
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
def reset_col(filename):
    """Resize every column of every sheet in an existing workbook to fit its text.

    Each sheet is re-read through pandas with missing cells replaced by '-'.
    A column's width is set to ``longest * 1.2 + 2``, where ``longest`` is the
    largest encoded byte length among the header and the cell texts of that
    column. The workbook is then saved back to ``filename``.

    Args:
        filename: Path of the .xlsx workbook to adjust in place.
    """
    wb = load_workbook(filename)
    for sheet in wb.sheetnames:
        ws = wb[sheet]
        df = pd.read_excel(filename, sheet_name=sheet).fillna('-')
        if df.columns.empty:
            # A blank sheet has nothing to measure, and appending a header
            # row to a column-less frame would raise.
            continue
        for position, col in enumerate(df.columns, start=1):
            letter = get_column_letter(position)  # 1-based index -> 'A', 'B', ...
            # Byte length (default UTF-8 encoding) so that multi-byte
            # characters count for more than one unit of width.
            longest = len(str(col).encode())
            if len(df):
                cell_lengths = df[col].map(lambda x: len(str(x).encode()))
                longest = max(longest, int(cell_lengths.max()))
            ws.column_dimensions[letter].width = longest * 1.2 + 2
    wb.save(filename)
# Pass the output path itself: reset_col() takes a filename, and handing it the
# ExcelWriter object only works through the writer's implicit path conversion.
reset_col(filepath + f'06二批次可插班明细{today}.xlsx')