pandas中series序列与dataframe数据帧整理

Pandas数据清洗

Series

带标签的一维数组，与Numpy中的一维array类似，只允许存储相同的数据类型

基本概念

import numpy as np

import pandas as pd #导入全部

from pandas import Series,DataFrame #只导入单个

索引

a 3

b 4

c 5

d 6

s['b'] #索引，获得单个值

s.b

s.get('b')

s[1] #默认索引

切片

切片不会改变其数据类型

s[['b']] #切片

b 4

dtype: int64

s[['a','b']] #切片

a 3

b 4

s[1:3] #默认索引，左闭右开

s['a':'c'] #指定索引切片，两边闭合

a 3

b 4

c 5

s[['a','b']] #选择，选择一个，选择多个

s.index[1] #取索引值

'b'

选择和过滤

s=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

s>3

布尔值

s[s>3]

切片

s[s.index=='a']

s[s.values == 6]

s.index=np.arange(4) #索引覆盖

s.values#获得值

array([3, 4, 5, 6], dtype=int64)

s.index #获得索引值

RangeIndex(start=0, stop=4, step=1)

s=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

系列的运算

Series数组运算都会保留索引和值之间的链接

序列运算保留索引

s1=s*2

索引不变，元素乘2

np.exp(s1)

返回以e为底、以各元素为指数的幂（索引保持不变）

序列运算索引自动对齐

s1=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

s2=Series([3,4,5,6],index=['b','a','d','c']) #指定索引

s1+s2

a 7

b 7

c 11

d 11

dtype: int64

#索引不存在，引入缺失值

s1=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

s2=Series([3,4,5,6],index=['e','a','d','f']) #指定索引

s3=s1+s2 #索引相同的元素相加；只在其中一方出现的索引，结果为缺失值NaN

a 7.0

b NaN

c NaN

d 11.0

e NaN

f NaN

dtype: float64

s3.isnull() #缺失值的判断

a False

b True

c True

d False

e True

f True

dtype: bool

s3.notnull()

a True

b False

c False

d True

e False

f False

dtype: bool

s3[s3.notnull()] #把非缺失值选出

a 7.0

d 11.0

dtype: float64

s3.isnull().values.sum() #统计有多少个缺失值

s3.isnull().sum()

序列的名字和索引名字

s1=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

s1.name='translate'

a 3

b 4

c 5

d 6

Name: translate, dtype: int64

s1.index.name='letter'

letter

a 3

b 4

c 5

d 6

Name: translate, dtype: int64

序列的创建

列表、元组

s1=Series([3,4,5,6],index=['a','b','c','d']) #列表

s1=Series((3,4,5,6),index=['a','b','c','d']) #元组

标量

s1=Series(5,index=['a','b','c','d']) #标量

所有元素均为5（标量按索引长度自动扩展），dtype: int64

数组

s1=Series(np.arange(1,5),index=['a','b','c','d']) #数组dtype: int32

字典

dict1={'a':1,'b':2,'c':3,'d':4} #示例字典

s1=Series(dict1) #不指定索引，默认以字典的key作为索引，字典值作为值

s1=Series(dict1,index=['c','d','e','f'])

按指定的索引从字典中取值，字典中没有的key填NaN

s1.index=['d','c','b','a']

重设索引

Dataframe

带标签的二维的表格型数据结构。可以将DataFrame理解为Series的容器

dataframe的创建

列表的创建

data1=[[1, 2, 3],[4, 5, 6],[7, 8, 9]] #二维列表创建,二维元组

d1=DataFrame(data1)

d1.index # 行索引

RangeIndex(start=0, stop=3, step=1)

d1.columns #列索引

Index(['one', 'two', 'three'], dtype='object') #这是设置columns之后的输出；未设置列名时为RangeIndex(start=0, stop=3, step=1)

d1.index=['a','b','c'] #创建dataframe后添加行索引和列索引

d1.columns=['one','two','three']

d1=DataFrame(data1,index=['a','b','c'],columns=['one','two','three']) #创建的时候加上行索引和列索引

二维数组的创建

df2=DataFrame(np.arange(16).reshape((4,4))) #数组转换为dataframe

等长列表、元组、数组、序列组成的字典

data1={'c':['1','2'],'a':['5','6']} #把key抽出形成columns

DataFrame(data1)

data1={'c':('1','2'),'a':('5','6')}

DataFrame(data1)

data1={'c':np.arange(4),'a':np.arange(4)}

DataFrame(data1)

data1={'c':Series([1,2]),'a':Series([3,4,5])}

DataFrame(data1)

字典组成的字典

nest_dict={'shanghai':{2015:100,2016:101},'beijing':{2015:102,2016:103,2017:109}}

DataFrame(nest_dict) #外层的key形成columns，里层的key成为index

字典或Series的列表

data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

DataFrame(data)

data2 = [Series([1, 2],index=['a','b']), Series([1,2],index=['a','b'])]

DataFrame(data2)

数据的加载和查看

数据的读写

csv文件

data1=[[1, 2, 3],[4, 5, 6],[7, 8, 9]] #二维列表创建,二维元组

d1=DataFrame(data1,index=['a','b','c'],columns=['one','two','three']) #创建的时候加上行索引和列索引

d1.to_csv('out1.csv',sep=',', header=True) #写入csv,带header

d1.to_csv('out2.csv',sep=',', header=False) #写入csv,不带header

d2=pd.read_csv('out1.csv')

d2.set_index('Unnamed: 0')

pd.read_csv('out2.csv',header=None,names=['one','two','three','four']) #文件没有表头时，可以通过names参数指定列名

excel文件

d2=pd.read_excel(r'D:\CDA\Dataclearing\NEW\test.xlsx') #读

d2.head(5) #查看前5行

d2.to_excel('test3.xlsx') #写

json格式

df1=pd.read_json(html)

df1.head()

df2=df1[['rank','title','regions','score']]

df2

#爬虫方式获取数据

import requests #爬虫请求库

url='https://movie.douban.com/j/chart/top_list?type=25&interval_id=100%3A90&action=&start=0&limit=20'

header={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

response = requests.get(url, headers=header)

html=response.text

pd.read_json(html)[['rank','regions','title','score','actors']].head()

df2.to_json('a.json',force_ascii=False) #保存；force_ascii默认为True，设为False后中文等非ASCII字符不会被转义

pd.read_json('a.json') #读取

html文件

df1=pd.read_html('http://stock.eastmoney.com/')

len(df1)

type(df1) #list，read_html返回的是由DataFrame组成的列表

剪贴板（clipboard）读取

pd.read_clipboard(header=None)

数据的查看

d2=pd.read_excel('test.xlsx') #读

d2.head()

d2.index=np.arange(1,201)

d2.head(3) #查看前面几行

d2.tail(4) #最后几行

d2.sample(n=4) #随机的抽取4行

d2.shape #形状

d2.info() #数据类型，缺失值

d2.dtypes #查看数据类型

d2.iloc[0:3,1]=np.NaN

d2.info() #数据类型，缺失值

d2.isnull().head()

d2.describe() #统计描述信息，非空的信息

d2.describe(include= object) #统计信息

索引和过滤

序列的索引和过滤

索引

s=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

s.b #得出元素

s[['a','c','d']]

s[1:3] #左闭右开

s.iloc[1:3]

s['a':'c']

s.loc['a':'c']

过滤

s[s>4] #选择大于4

s[(s>4)&(s<6)]

dataframe的索引和过滤

import numpy as np

np.random.seed(10)

df = DataFrame(np.random.rand(6,4)*2-1)

df.index = ['a', 'b', 'c', 'd', 'e', 'f']

df.columns = ['A', 'B', 'C', 'D']

索引

选择一行和多行[一个中括号]

#选择行

df[:1] #选择第0行，默认的索引选择行

df[2:3] #选择多行,左闭右开，得第2行

df[2::2] #选择多行,左闭右开

df['c':'e'] #选择多行,指定的索引选择多行,两边闭合，选出3行

df['c':'c'] #选出c行

选择列[[两个中括号]]

df['A'] #选出的是序列

df[['A']] #选择一列，dataframe

df[['A','C','B']] #选择多列

行，列的选择,loc方式,用指定的索引

df.loc['a':'c',['A','D']] #选择多行和多列,指定的索引和列名

df.loc[:,['A','B']] #选择多行和多列

df.loc['b':'d':2,'A':'C'] #选择多行和多列，指定范围

选出一个数据（结果仍是DataFrame）

df.loc['b':'b','A':'A']

iloc方式选择多行和多列，用默认的索引

df.iloc[:,[0,2]]

所有行2列

df.iloc[2:4]

选出2行所有列

df.iloc[2:4:2,2:4]

过滤

df[df>0] # 选择大于零的所有数据

#选择a行大于0的所有列

df.loc['a']>0 #得出布尔序列

df.loc[:,df.loc['a']>0]

得出dataframe

#选择C列里面大于零的所有行

df['C']>0

df.loc[df['C']>0,:]

#选择a行大于零，A列大于零

df.loc[df['A']>0,df.loc['a']>0]

#选择a行大于零，b行小于零，A列大于零，B列小于零

df.loc[(df['A']>0)&(df['B']<0),(df.loc['a']>0)&(df.loc['b']<0)]

#判断A列和C列是否都非空，得出布尔序列；再用该序列选出对应的所有行

df1['A'].notnull()&df1['C'].notnull()

df1[df1['A'].notnull()&df1['C'].notnull()]

添加列

df['E']=['one','one','two','two','three','three']

判断，得序列

df['E']=='two'

#选择E列是two的所有行

df[df['E']=='two']

df[df.E.isin(['two'])]

E列以t开头

df.E.str.startswith('t')

df[df.E.str.startswith('t')] #优先考虑这种

df[df.E.map(lambda x:x.startswith('t'))] #对字符串的操作

重新索引和填充

序列的索引和插值

s=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

s.reindex(['a','d','e','f']) #原索引值不在，引入缺失值

s.reindex(['a','d','e','f'],fill_value=0) #原索引值不在，填充0

s=Series(['a','b','c'],index=[0,2,4]) #index为数值，时间

s.reindex(range(6),fill_value='a') #插值，上采样，已存在的值不变

s.reindex(range(6),method='ffill') #重复操作，前向填充，填充与前一个值相同

s.reindex(range(6),method='bfill') #重复操作，后向填充，填充与后一个值相同

dataframe的重新索引和插值

data1=[[1, 2, 3],[4, 5, 6],[7, 8, 9]] #二维列表创建,二维元组

d1=DataFrame(data1,index=['a','b','c'],columns=['one','two','three'])

d1.reindex(['a','b','c','d']) #重索引行，引入缺失值

d1.reindex(['a','b','c','d'],method='ffill') #重索引行，填充

d1.reindex(columns=['one','two','three','four'],fill_value=0) #重索引列，引入缺失值

d1.reindex(columns=np.arange(4),method='pad') #重索引列，具体行为与pandas版本有关

#对列进行填充，索引要单调递增或者递减

数据清洗

增加和删除

序列的增加和删除

s=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

s['e']=5 #增加，用法类似字典的赋值

s['a']=0 #修改

s.drop('a') #删除

s.drop(['a','c'])#删除多个，新生成

未改变原数据

s=Series([3,4,5,6],index=['a','b','c','d']) #指定索引

s.drop(['a','c'],inplace=True)#删除多个,原地删除

s.pop('b') #弹出所删除的值

dataframe的增加和删除

data1=[[1, 2, 3],[4, 5, 6],[7, 8, 9]] #二维列表创建,二维元组

d1=DataFrame(data1,index=['a','b','c'],columns=['one','two','three'])

d1.loc['d']=[7,7,7]# 增加一行，如果原来的行存在则进行修改

d1.loc['e']=9 #标量自动扩展

#两个dataframe如何增加行

data1=[[1, 2, 3],[4, 5, 6],[7, 8, 9]] #二维列表创建,二维元组

d1=DataFrame(data1,index=['a','b','c'],columns=['one','two','three'])

d2=DataFrame(data1,index=['a','b','c'],columns=['one','two','three'])

增加

d1=d1.append(d2) #增加多行；append返回新对象，不会原地修改，所以要重新赋值给d1（pandas 2.0起已移除append，可改用pd.concat([d1,d2])）

d1['four']=[7,8,9]#长度要一致

d1['five']=10 #标量自动扩展

d1['six']=d1['five']+d1['one']

d1.sum()

按列求和，序列

d1['seven']=d1.sum(axis=1)

df形式

删除

d1.drop('a',axis=0) #删除行，默认情况axis=0

d1.drop('six',axis=1) #删除列

d1.drop(['one','six'],axis=1) #删除多列

d1.drop(['a','b'],axis=0) #删除多行