pandas Series 和 DataFrame

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# Series basics: build a Series, inspect it, then index, slice and filter it.
s=Series([3,4,5,6])

# Check the object's type: type(s)

# Get the underlying values: s.values

# Get the index: s.index

# Rebuild the Series with an explicit label index

s = Series([3,4,5,6],index=['a','b','c','d'])

# Single-value access: s['b'], s.b, s.get('b'), s.iloc[1]
# (s.iloc[1] is positional; the older s[1] spelling on a label index is
# deprecated since pandas 2.1)

# Select several labels with a list: s[['a','b']]

s[1:3] # positional slice: start included, stop excluded

s['a':'c'] # label slice: both endpoints included

s.index[1] # the index label at position 1

# Selection and filtering

s[s>3] # values 4, 5, 6

s[s.index=='a'] # filter on the index label

s[s.values==6] # filter on the value; the result is d -> 6

# Missing-value checks and naming a Series.
s=Series([3,4,5,6],index=['a','b','c','d']) # explicit label index

# s3 and s1 were used below without ever being assigned in this file.
# Reindexing s onto labels it lacks ('e', 'f') yields a Series with NaN
# entries, which is what the missing-value examples need.
s3=s.reindex(['a','b','e','f'])

s3.isnull() # boolean mask: True where the value is missing

s3.isnull().values.sum() # number of missing values

s3[s3.notnull()] # keep only the non-missing values

# A separate Series for the naming examples, so s keeps its unnamed index.
s1=Series([3,4,5,6],index=['a','b','c','d'])

s1.name='translate' # name of the Series

s1.index.name='letter' # name of the index

序列创建

s1=Series([3,4,5,6],index=['a','b','c','d']) # from a list

s2=Series((3,4,5,6),index=['a','b','c','d']) # from a tuple

s1=Series(5,index=['a','b','c','d']) # from a scalar, repeated for every label

s1=Series(np.arange(1,5),index=['a','b','c','d']) # from a NumPy array

dict1={'a':1,'b':3,'c':5,'d':8}

s1=Series(dict1) # no index given: dict keys become the index, dict values the data

# With an explicit index the dict is looked up by label: 'c' and 'd' keep
# their values, while 'e' and 'f' are not in dict1 and become NaN.
s1=Series(dict1,index=['c','d','e','f'])

DataFrame 创建

# Build a DataFrame from a nested list (a nested tuple works the same way).
data1 = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
d1 = DataFrame(data=data1)

d1.index  # row labels
d1.columns  # column labels

# Attach row and column labels after construction ...
d1.columns = ['one', 'two', 'three']
d1.index = ['a', 'b', 'c']

# ... or pass them straight to the constructor.
d1 = DataFrame(data1, columns=['one', 'two', 'three'], index=['a', 'b', 'c'])

二维数组的创建

# A 2-D NumPy array converts directly into a DataFrame.
df2 = DataFrame(np.reshape(np.arange(16), (4, 4)))

# Dict of equal-length lists (tuples, arrays or Series work too): each key
# becomes a column.
data1 = dict(c=['1', '2'], a=['5', '6'])

DataFrame(data1)

字典组成的字典

# Dict of dicts: outer keys become the columns, inner keys become the index.
nest_dict={'shanghai':{2015:100,2016:101},'beijing':{2015:102,2016:103,2017:109}}
DataFrame(nest_dict) # 2017 exists only under 'beijing', so 'shanghai' is NaN in that row

字典或Series的列表

# List of dicts: one row per dict, the keys become the columns.
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
DataFrame(data) # the first dict has no 'c', so that cell is NaN

图片.png

# List of Series: one row per Series, the Series labels become the columns.
data2 = [Series([1, 2],index=['a','b']), Series([1,2],index=['a','b'])]
DataFrame(data2)

图片.png

数据的读取

csv文件

# CSV round trip: write d1 with and without a header row, then read both back.
data1=[[1, 2, 3],[4, 5, 6],[7, 8, 9]] # nested list (a nested tuple works too)
d1=DataFrame(data1,index=['a','b','c'],columns=['one','two','three']) # labels given at construction

d1.to_csv('out1.csv',sep=',', header=True) # write CSV with the header row

d1.to_csv('out2.csv',sep=',', header=False) # write CSV without the header row

# to_csv also wrote the row labels as the first field of every line, so
# index_col=0 turns that field back into the index instead of leaving it as
# an extra, unnamed data column.
d2=pd.read_csv('out1.csv',index_col=0)

# out2.csv has no header, so the column names come from `names`. The first
# name labels the row-label field, which index_col then uses as the index;
# the remaining three names line up with the original data columns.
pd.read_csv('out2.csv',header=None,names=['label','one','two','three'],index_col='label')

# Excel / JSON input and output, followed by quick inspection of the loaded frame.
d2=pd.read_excel(r'D:\CDA\Dataclearing\NEW\test.xlsx') # read an Excel workbook
d2.head(5) # first 5 rows

d2.to_excel('test3.xlsx') # write the frame to an Excel workbook

# NOTE(review): `html` is not defined anywhere in the visible file; it is
# presumably JSON text (or a path/URL) prepared elsewhere — confirm.
df1=pd.read_json(html)
df1.head()

df2=df1[['rank','title','regions','score']]

# force_ascii defaults to True, which escapes non-ASCII characters; passing
# False writes them to the file as-is.
df2.to_json('a.json',force_ascii=False)

d2=pd.read_excel('test.xlsx') # read an Excel workbook
d2.head()

d2.iloc[0:2,2]=np.nan # positional assignment: first two rows of the third column become NaN

d2.head(3) # first 3 rows

d2.tail(4) # last 4 rows

d2.sample(n=4) # 4 randomly chosen rows

d2.shape # (number of rows, number of columns)

d2.info() # per-column dtype and non-null count

d2.dtypes # dtype of every column

d2.describe(include= object) # summary statistics restricted to object-dtype columns

索引和过滤

序列的索引和过滤

s=Series([3,4,5,6],index=['a','b','c','d']) # Series with an explicit label index

dataframe的索引和过滤

import numpy as np

# Fixed seed so the random demo frame is the same on every run.
np.random.seed(10)
# rand() samples from [0, 1); doubling and subtracting 1 maps that to [-1, 1).
df = DataFrame(
    2 * np.random.rand(6, 4) - 1,
    index=list('abcdef'),
    columns=list('ABCD'),
)

索引

# Row selection with plain [] slices
df[:1] # first row only; integer slices select rows by position

df[2:3] # positional row slice: start included, stop excluded (just the third row)

df[2::2] # from the third row onward, every second row (step 2)

df['c':'e'] # label slice over the row index: both endpoints included

df['A'] # a single column label returns a Series

图片.png

df[['A']] # a one-element list of column labels returns a DataFrame

df.loc['a':'c',['A','D']] # rows 'a' through 'c' (inclusive), columns 'A' and 'D', all by label

df.loc['b':'d':2,'A':'C'] # label ranges with a step: rows 'b' and 'd', columns 'A' through 'C'

iloc方式选择多行和多列，用默认的索引

df.iloc[:,[0,2]] # all rows, first and third columns, by position

df.loc[:,df.loc['a']>0] # all rows, only the columns whose value in row 'a' is positive

# All rows where column C is positive. The comparison alone only builds a
# boolean mask; indexing df with that mask performs the selection.
df[df['C']>0]

# Rows where A > 0 and B < 0, restricted to the columns whose value is
# positive in row 'a' and negative in row 'b'.
df.loc[(df['A']>0)&(df['B']<0),(df.loc['a']>0)&(df.loc['b']<0)]

# All rows where both A and C are non-null (mask applied to select the rows).
# NOTE(review): df1 is last bound to the read_json result earlier in this
# file; this line presumably expects a frame with columns A and C — confirm.
df1[df1['A'].notnull()&df1['C'].notnull()]

# All rows where column E equals 'two'.
# NOTE(review): the df built earlier in this file only has columns A-D; the
# E examples presumably rely on a string column E added elsewhere — confirm.
df[df['E']=='two']

df[df.E.isin(['two'])]

# All rows where column E starts with 't'.
df[df.E.str.startswith('t')] # preferred: vectorised string method

df[df.E.map(lambda x:x.startswith('t'))] # same selection with an element-wise map

#重新索引和填充

s=Series([3,4,5,6],index=['a','b','c','d']) # explicit label index

s.reindex(['a','d','e','f']) # labels absent from the original index ('e', 'f') get NaN

s.reindex(['a','d','e','f'],fill_value=0) # absent labels are filled with 0 instead of NaN

s=Series(['a','b','c'],index=[0,2,4]) # numeric index (the same idea applies to a time index)

s.reindex(range(6),fill_value='a') # expand to labels 0..5; the new labels 1, 3, 5 get the constant 'a'

s.reindex(range(6),method='ffill') # forward fill: each new label takes the previous existing value

s.reindex(range(6),method='bfill') # backward fill: each new label takes the next existing value

图片.png

d1.reindex(['a','b','c','d']) # reindex rows: the new label 'd' is filled with NaN

d1.reindex(['a','b','c','d'],method='ffill') # reindex rows, forward-filling 'd' from the row before it

d1.reindex(columns=['one','two','three','four'],fill_value=0) # reindex columns: the new column 'four' is filled with 0

# NOTE(review): d1's columns here are the strings 'one', 'two', 'three', so
# padding onto the integer labels 0..3 presumably fails (filling needs a
# monotonic index of comparable labels) — confirm. The original note says the
# behaviour depends on the pandas version.
d1.reindex(columns=np.arange(4),method='pad')

数据清洗

增加和删除

序列的增加和删除

# s was last rebound to Series(['a','b','c'], index=[0,2,4]) in the reindex
# examples, which has none of the labels used below; start again from the
# label-indexed Series so the drops and the pop refer to existing labels.
s=Series([3,4,5,6],index=['a','b','c','d'])

s['e']=5 # assigning to a new label appends it, like adding a key to a dict

s.drop('a') # returns a new Series without 'a'; s itself is unchanged

s.drop(['a','c']) # drop several labels, again returning a new Series

s.drop(['a','c'],inplace=True) # drop several labels in place, modifying s

s.pop('b') # remove 'b' from s and return its value

dataframe 的增加和删除

# Small labelled frame used by the add/delete examples.
data1 = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]  # nested list; a nested tuple works the same way
d1 = DataFrame(data1, columns=['one', 'two', 'three'], index=list('abc'))

图片.png

d1.loc['d']=[7,7,7] # add a row; if the label already exists the row is overwritten

d1.loc['e']=9 # a scalar is broadcast across the whole row

# Add several rows at once. DataFrame.append was removed in pandas 2.0;
# pd.concat is the replacement and likewise returns a new frame rather than
# modifying d1 in place.
d1=pd.concat([d1,d2])

a=[1,2,3]
a.append(4) # by contrast, list.append modifies the list in place

# Deleting rows and columns; without inplace=True each call returns a new frame.
d1.drop('a',axis=0) # drop a row; axis=0 is the default

# NOTE(review): no column named 'six' is created anywhere in the visible
# file, so the two column drops below presumably rely on a column added
# elsewhere (or brought in with d2's rows) — confirm.
d1.drop('six',axis=1) # drop one column

d1.drop(['one','six'],axis=1) # drop several columns

d1.drop(['a','b'],axis=0) # drop several rows