导入数据
df1=pd.read_excel('.xlxs',sheetname='')
df2=...
df=pd.concat([df1,df2,df3],axis=0) #axis=0>>>detail_id
df.reset_index(drop=True,inplace=True)
df.shape
df,info()
清洗数据
检查是否存在重复值
df.duplicated().sum()
删除缺失的字段
df.dropna(axis=1,inplace=True)
删除无用的字段
df[''].unique()
df.drop(['A','B'],axis=1,inplace=True)
df.head()
dishes_name字段处理
df['dish_names']=df['dish_names'].str,strip('[]')
[33]:
#繁体简体转换
from langconv import *
def simple2tradition(line):
#将简体转换成繁体
line = Converter('zh-hant').convert(line)
return line
def tradition2simple(line):
# 将繁体转换成简体
line = Converter('zh-hans').convert(line)
return line
simplified_sentence = tradition2simple('憂郁的臺灣烏龜')
print(simplified_sentence)
traditional_sentence = simple2tradition('忧郁的台湾乌龟')
print(traditional_sentence)
忧郁的台湾乌龟
憂郁的臺灣烏龜
from langconv import *
def tradition2simple(line):
# 将繁体转换成简体
line = Converter('zh-hans').convert(line)
return line
df['dish_names']=df['dish_names'].apply(tradition2simple)
添加money列
df.insert(6,'money',df['counts']*df['amounts'])
数据分析
一共有多少笔订单
df['order_id'].nunique()
有多少个不同的菜
df['dish_names'].nunique()
统计菜品价格的平均值
df.drop_duplicates(subset='dish_names')['amounts'].mean()
什么菜最受欢迎(点的次数最多)
df['dish_names'].value_counts()
哪个菜吃的份数最多
df[['dish_names','amounts']].groupby('dish_names').sum().nlargest(10,columns='amounts')
哪个用户id点的菜最多
##把detail_id看作是用户id
df[['detail_id','counts']].groupby('detail_id').sum().nlargest(10,columns='counts')
哪个id吃的钱数多
df[['detail_id','money']].groupby('detail_id').sum().nlargest(10,columns='money')
哪个id吃的平均菜价贵
df1=df[['detail_id','money']].groupby('detail_id').sum()
df2=df[['detail_id','counts']].groupby('detail_id').sum()
s=df1['money']/df2['counts']
s.nlargest(10)
时间相关分析
一天什么时候吃饭最多(点的菜)
df['hour']=df['place_order_time'].dt.hour
df['hour'].value_counts().sort_index().plot(kind='bar')
哪一天吃饭的人最多
df['day']=df['place_order_time'].dt.day
df['day'].value_counts().sort_index().plot(kind='bar')
星期几吃饭的人最多
df['weekday']=df['place_order_time'].dt.weekday+1
df['weekday'].value_counts().sort_index().plot(kind='bar')
·
·








暂无数据