读取数据以及数据整理

[25]:

# Load the exported QQ chat log as one UTF-8 string.
with open('QQ.txt', mode='r', encoding='utf-8') as file:
    txt = file.read()

# Preview only the beginning: echoing the whole log floods the cell output
# and stores every participant's messages in the saved notebook.
txt[:500]

分别提取时间,用户名,聊天内容

[27]:

# Split the raw log into record headers and message bodies with one regex.
import re

# A header looks like '2016-06-24 15:42:52 张某(40**21)': a date starting with
# "20", a time, then everything up to the last ')' or '>' on that line
# ('.' does not cross newlines and '.*' is greedy).
# Raw string: '\d', '\s' and '\)' are invalid escape sequences in a normal
# literal and trigger a warning on recent Python versions.
re_pat = r'20[\d-]{8}\s[\d:]{7,8}\s.*[\)>]'

# Every header, in file order.
log_title_arr = re.findall(re_pat, txt)

# Text between headers. Element 0 is whatever precedes the first header, so
# this list is one element longer than log_title_arr.
log_content_arr = re.split(re_pat, txt)

# Peek at the first two entries of each list.
log_title_arr[:2]

log_content_arr[:2]

[29]:

# Number of record headers found by the regex.
len(log_title_arr)

[29]:

4663

[30]:

len(log_content_arr) # the contents list has one more element than the headers list

[30]:

4664

# re.split() also returns the text that precedes the first header; it belongs
# to no record. Drop it only while the list is still one element longer than
# the headers, so re-running this cell cannot remove a real message (an
# unconditional pop(0) shifts every message by one on each extra run).
if len(log_content_arr) == len(log_title_arr) + 1:
    del log_content_arr[0]

[32]:

# Length of the contents list after dropping the leading fragment.
len(log_content_arr)

[32]:

4663

# Combine the header and content lists into one DataFrame, one row per message.
# pandas is imported here because no import of it is visible in this notebook;
# importing it a second time is harmless.
import pandas as pd

# Building from a dict creates both columns in one step and still raises
# ValueError if the two lists ever differ in length.
df = pd.DataFrame({'记录头': log_title_arr, '内容': log_content_arr})

df.head(3)

分离日期、用户名、QQ号

# Pull the full timestamp out of the header.
# The previous pattern '[\d-]{8}\s...' could only match the last 8 characters
# of the 10-character date, so it produced '16-06-24 15:42:52'. A two-digit
# year is ambiguous for pd.to_datetime and may be read as day-month-year.
# Match the 4-digit year explicitly (the hour may have one or two digits).
df['日期'] = df['记录头'].str.extract(
    r'(\d{4}-\d{2}-\d{2}\s\d{1,2}:\d{2}:\d{2})', expand=False
)

df.head(3)

[46]:

# Nickname: the text between the time and the last '(' or '<' of the header
# ('.*' is greedy, so brackets inside a nickname stay part of the name).
# Raw string avoids the invalid '\d' / '\(' escapes of a normal literal.
# str.extract returns the first match's group, or NaN when nothing matches.
df['QQ名'] = df['记录头'].str.extract(r'\d{2}:\d{2} (.*)[\(<]', expand=False)

df.head(3)

# Sanity-check the columns extracted so far.
df['日期'].unique()

df['QQ名'].unique()

# Account id: whatever sits inside the final (...) or <...> of the header.
# The previous character class [0-9A-Z@\.] left out lowercase letters, so an
# account written with them (e.g. an e-mail address) was not captured and
# became NaN. Without the '$' anchor, a bracketed part of a nickname could
# also match before the real id.
df['QQ号'] = df['记录头'].str.extract(r'[(<]([^()<>]+)[)>]$', expand=False)

df['QQ号'].unique()

[65]:

# The raw header is no longer needed once date, nickname and id are split out.
# Rebinding with errors='ignore' keeps the cell re-runnable; the in-place drop
# raised KeyError the second time because the column was already gone.
df = df.drop(columns='记录头', errors='ignore')

df.head(3)

对聊天内容处理

[68]:

# Remove the newline characters that surround each message body.
stripped_content = df['内容'].str.strip('\n')
df['内容'] = stripped_content

# Distinct message texts after cleaning.
df['内容'].unique()

删除用户名为“系统消息”的记录

[72]:

# Keep only messages written by people, dropping rows whose nickname is
# '系统消息'. The explicit copy makes df1 independent of df, so the columns
# added to df1 later do not trigger SettingWithCopyWarning.
df1 = df[df['QQ名'] != '系统消息'].copy()

[73]:

# Renumber the rows 0..n-1 after filtering. Rebinding the name instead of
# using inplace=True gives a fresh frame and avoids mutating a filtered slice.
df1 = df1.reset_index(drop=True)

df1.info()

探索性分析

谁说过话

[77]:

# Number of distinct accounts that posted at least one message
# (rows without an account id are not counted).
speaker_ids = df1['QQ号']
speaker_ids.nunique()

[77]:

谁是话痨

# Message count per account, busiest account first.
messages_per_account = df1['QQ号'].value_counts()
messages_per_account

话痨聊啥

# All messages of one account joined into a single '.'-separated string.
# NOTE(review): the account id is hard-coded and the lookup reads df rather
# than df1 — confirm both are intended.
target_account = '2766158399'
is_target = df['QQ号'] == target_account
df.loc[is_target, '内容'].str.cat(sep='.')

聊天密度周分布

[93]:

# Work on an explicit copy so the column assignments that follow do not raise
# SettingWithCopyWarning (df1 was produced by filtering df). This replaces the
# earlier blanket warnings.filterwarnings("ignore"), which silenced every
# warning in the notebook instead of fixing the cause.
df1 = df1.copy()

# Convert the timestamp strings to datetime64 so the .dt accessor can be used.
# NOTE(review): no explicit format is given, so parsing relies on pandas'
# inference — confirm the strings carry an unambiguous 4-digit year.
df1['日期'] = pd.to_datetime(df1['日期'])

df1.info()

[96]:

# Day of week as 1..7 (dt.weekday is 0-based starting on Monday).
df1['周'] = df1['日期'].dt.weekday + 1

# Reindex to all seven days so a day without messages shows as an empty bar
# instead of silently disappearing from the axis; reindexing also sorts.
weekday_counts = df1['周'].value_counts().reindex(range(1, 8), fill_value=0)

ax = weekday_counts.plot(kind='bar')
ax.set(xlabel='day of week (1 = Monday)', ylabel='messages');

聊天密度小时分布

[102]:

# Hour of day (0-23) in which each message was sent.
df1['小时'] = df1['日期'].dt.hour

# Reindex to all 24 hours so quiet hours appear as empty bars rather than
# being left out of the axis; reindexing also sorts.
hour_counts = df1['小时'].value_counts().reindex(range(24), fill_value=0)

ax = hour_counts.plot(kind='bar')
ax.set(xlabel='hour of day', ylabel='messages');

聊天密度日期分布

[111]:

# Calendar date (without the time part) of every message.
message_dates = df1['日期'].dt.date
df1['date'] = message_dates

# Messages per calendar day in chronological order, one bar per day present.
daily_counts = df1['date'].value_counts()
daily_counts.sort_index().plot(kind='bar')

活跃天数最多的用户

[117]:

df2.info()

Int64Index: 895 entries, 0 to 4489

Data columns (total 7 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 内容 895 non-null object

1 日期 895 non-null datetime64[ns]

2 QQ名 895 non-null object

3 QQ号 866 non-null object

4 周 895 non-null int64

5 小时 895 non-null int64

6 date 895 non-null object

dtypes: datetime64[ns](1), int64(2), object(4)

memory usage: 55.9+ KB

df2=df1.drop_duplicates(subset=['QQ号','date'])

df2['QQ号'].value_counts()

大家都在聊啥（分词与词频统计）

import jieba

# Join every message into one string for tokenisation. A new name is used on
# purpose: `txt` holds the raw log that the header regex parses, and
# overwriting it makes the extraction cell produce different results when it
# is re-run.
chat_text = df1['内容'].str.cat(sep='。')

# Preview only; the full string is the entire chat history.
chat_text[:200]

# jieba.lcut returns the tokens as a list.
L = jieba.lcut(chat_text)

# Keep tokens longer than one character (single characters are mostly
# punctuation and particles).
L1 = [word for word in L if len(word) > 1]

去停用词

# Tokens to exclude from the word-frequency statistics.
stop_words = [
    '图片',
    '表情',
    'id',
    '北京',
    '会议',
]

[133]:

# Keep tokens longer than one character that are not stop words.
# `and` replaces the bitwise `&`: it is the boolean operator and it
# short-circuits, so the stop-word lookup is skipped for single characters.
# A set gives constant-time membership tests for the lookup.
excluded = set(stop_words)
L1 = [word for word in L if len(word) > 1 and word not in excluded]

type(L1)

L1[:3]

# Word frequencies as a Series: index = word, value = count, most frequent first.
s = pd.Series(L1).value_counts()

s[:3]

# Convert the Series into a list of [word, count] pairs.
data = [[word, count] for word, count in zip(s.index, s.tolist())]

data[:3]

# Word cloud of the [word, count] pairs, written to an HTML file.
import pyecharts.options as opts
from pyecharts.charts import WordCloud

(
    WordCloud()
    .add(series_name="qq聊天", data_pair=data, word_size_range=[20, 66])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="qq聊天", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),  # this closing parenthesis and comma were missing, which was a SyntaxError
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
    .render("qq聊天.html")
)