pandas数据结构：DataFrame-CDA数据分析师官网

热线电话：13121318867

pandas数据结构：DataFrame

2020-06-12

刚刚接触pandas的朋友，想了解数据结构，就一定要认识DataFrame，接下来给大家详细介绍！

初识pandas数据结构：DataFrame

import numpy as np
import pandas as pd

data = {"name": ["Jack", "Tom", "LiSa"],
        "age": [20, 21, 18],
        "city": ["BeiJing", "TianJin", "ShenZhen"]}
print(data)
print("")

frame = pd.DataFrame(data)  # 创建DataFrame
print(frame)
print("")

print(frame.index)  # 查看行索引
print("")

print(frame.columns)  # 查看列索引
print("")

print(frame.values)  # 查看值

{'name': ['Jack', 'Tom', 'LiSa'], 'age': [20, 21, 18], 'city': ['BeiJing', 'TianJin', 'ShenZhen']}

   age      city  name
0   20   BeiJing  Jack
1   21   TianJin   Tom
2   18  ShenZhen  LiSa

RangeIndex(start=0, stop=3, step=1)

Index(['age', 'city', 'name'], dtype='object')

[[20 'BeiJing' 'Jack']
 [21 'TianJin' 'Tom']
 [18 'ShenZhen' 'LiSa']]

创建DataFrame

方法一: 由字典创建。字典的key是列索引，值可以是：

1.列表

2.ndarray

3.Series

# 值是ndarray  注意: 用ndarray创建DataFrame值的个数必须相同 否则报错

data2 = {"one": np.random.rand(3),
         "two": np.random.rand(3)
        }
print(data2)
print("")

print(pd.DataFrame(data2))

{'one': array([ 0.60720023,  0.30838024,  0.30678266]), 'two': array([ 0.21368784,  0.03797809,  0.41698718])}

        one       two
0  0.607200  0.213688
1  0.308380  0.037978
2  0.306783  0.416987

# 值是Series--带有标签的一维数组  注意: 用Series创建DataFrame值的个数可以不同  少的值用NaN填充

data3 = {"one": pd.Series(np.random.rand(4)),
         "two": pd.Series(np.random.rand(5))
        }
print(data3)
print("")

df3 = pd.DataFrame(data3)
print(df3)
print("")

{'one': 0    0.217639
1    0.921641
2    0.898810
3    0.933510
dtype: float64, 'two': 0    0.132789
1    0.099904
2    0.723495
3    0.719173
4    0.477456
dtype: float64}

        one       two
0  0.217639  0.132789
1  0.921641  0.099904
2  0.898810  0.723495
3  0.933510  0.719173
4       NaN  0.477456

# 值是Series--带有标签的一维数组  注意: 用Series创建DataFrame值的个数可以不同  少的值用NaN填充

data3 = {"one": pd.Series(np.random.rand(4)),
         "two": pd.Series(np.random.rand(5))
        }
print(data3)
print("")

df3 = pd.DataFrame(data3)
print(df3)
print("")

{'one': 0    0.217639
1    0.921641
2    0.898810
3    0.933510
dtype: float64, 'two': 0    0.132789
1    0.099904
2    0.723495
3    0.719173
4    0.477456
dtype: float64}

        one       two
0  0.217639  0.132789
1  0.921641  0.099904
2  0.898810  0.723495
3  0.933510  0.719173
4       NaN  0.477456

方法二: 通过二维数组直接创建（注意: 下面示例中的 data 实际上是由字典组成的列表，每个字典对应一行）

data = [{"one": 1, "two": 2}, {"one": 5, "two": 10, "three": 15}]  # 每一个字典在DataFrame里就是一行数据
print(data)
print("")

df1 = pd.DataFrame(data)
print(df1)
print("")

df2 = pd.DataFrame(data, index=list("ab"), columns=["one", "two", "three", "four"])
print(df2)

[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 15}]

   one  three  two
0    1    NaN    2
1    5   15.0   10

   one  two  three  four
a    1    2    NaN   NaN
b    5   10   15.0   NaN

方法三: 由字典组成的列表创建 DataFrame（注意: 下面示例中的 data 实际上是由字典组成的字典，与方法四的示例相同）

# columns为字典的key index为子字典的key

data = {"Jack": {"age":1, "country":"China", "sex":"man"}, 
        "LiSa": {"age":18, "country":"America", "sex":"women"},
        "Tom": {"age":20, "country":"English"}}

df1 = pd.DataFrame(data)
print(df1)
print("")

# 注意: 这里的index并不能给子字典的key(行索引)重新命名 但可以给子字典的key重新排序 若出现原数组没有的index 那么就填充NaN值

df2 = pd.DataFrame(data, index=["sex", "age", "country"])
print(df2)
print("")

df3 = pd.DataFrame(data, index=list("abc"))
print(df3)
print("")

# columns 给列索引重新排序 若出现原数组没有的列索引填充NaN值
df4 = pd.DataFrame(data, columns=["Tom", "LiSa", "Jack", "TangMu"])
print(df4)

          Jack     LiSa      Tom
age          1       18       20
country  China  America  English
sex        man    women      NaN

          Jack     LiSa      Tom
sex        man    women      NaN
age          1       18       20
country  China  America  English

   Jack  LiSa  Tom
a   NaN   NaN  NaN
b   NaN   NaN  NaN
c   NaN   NaN  NaN

             Tom     LiSa   Jack TangMu
age           20       18      1    NaN
country  English  America  China    NaN
sex          NaN    women    man    NaN

方法四: 由字典组成的字典

# columns为字典的key index为子字典的key

data = {"Jack": {"age":1, "country":"China", "sex":"man"}, 
        "LiSa": {"age":18, "country":"America", "sex":"women"},
        "Tom": {"age":20, "country":"English"}}

df1 = pd.DataFrame(data)
print(df1)
print("")

# 注意: 这里的index并不能给子字典的key(行索引)重新命名 但可以给子字典的key重新排序 若出现原数组没有的index 那么就填充NaN值

df2 = pd.DataFrame(data, index=["sex", "age", "country"])
print(df2)
print("")

df3 = pd.DataFrame(data, index=list("abc"))
print(df3)
print("")

# columns 给列索引重新排序 若出现原数组没有的列索引填充NaN值
df4 = pd.DataFrame(data, columns=["Tom", "LiSa", "Jack", "TangMu"])
print(df4)

          Jack     LiSa      Tom
age          1       18       20
country  China  America  English
sex        man    women      NaN

          Jack     LiSa      Tom
sex        man    women      NaN
age          1       18       20
country  China  America  English

   Jack  LiSa  Tom
a   NaN   NaN  NaN
b   NaN   NaN  NaN
c   NaN   NaN  NaN

             Tom     LiSa   Jack TangMu
age           20       18      1    NaN
country  English  America  China    NaN
sex          NaN    women    man    NaN

DataFrame索引

选择行与列

选择列直接用df["列标签"]

df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                 index = ["one", "two", "three"], columns = ["a", "b", "c", "d"])
print(df)
print("")

print(df["a"], "  ", type(df["a"]))  # 取一列
print("")

print(df[["a", "c"]], "  ", type(df[["a", "c"]]))  # 取多列

               a          b          c          d
one    92.905464  11.630358  19.518051  77.417377
two    91.107357   0.641600   4.913662  65.593182
three   3.152801  42.324671  14.030304  22.138608

one      92.905464
two      91.107357
three     3.152801
Name: a, dtype: float64    <class 'pandas.core.series.Series'>

               a          c
one    92.905464  19.518051
two    91.107357   4.913662
three   3.152801  14.030304    <class 'pandas.core.frame.DataFrame'>

不能通过 df["one"] 这样的标签索引来选择行（这种写法选取的是列）；选择行要用 df.loc["one"]，loc 按行标签来选取

print(df)
print("")

print(df.loc["one"], " ", type(df.loc["one"]))  # 取一行
print("")

print(df.loc[["one", "three"]], " ", type(df.loc[["one", "three"]])) # 取不连续的多行
print("")

               a          b          c          d
one    92.905464  11.630358  19.518051  77.417377
two    91.107357   0.641600   4.913662  65.593182
three   3.152801  42.324671  14.030304  22.138608

a    92.905464
b    11.630358
c    19.518051
d    77.417377
Name: one, dtype: float64   <class 'pandas.core.series.Series'>

               a          b          c          d
one    92.905464  11.630358  19.518051  77.417377
three   3.152801  42.324671  14.030304  22.138608   <class 'pandas.core.frame.DataFrame'>

loc支持切片索引--针对行并包含末端 df.loc["one": "three"]

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100, index=["one", "two", "three", "four"],
                 columns=["a", "b", "c", "d"])
print(df)
print("")

print(df.loc["one": "three"])
print("") 

print(df[: 3])  # 切片表示取连续的多行(尽量不用 免得混淆)

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401
four   45.591798  63.274956  74.056045   2.466652

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401

iloc也是对行来操作的，只不过把行标签换成了整数位置（从0开始），并且切片不包含末端

print(df)
print("")

print(df.iloc[0])  # 取一行
print("")

print(df.iloc[[0,2]])  # 取不连续的多行
print("")

print(df.iloc[0:3])  # 不包含末端

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401
four   45.591798  63.274956  74.056045   2.466652

a    65.471894
b    19.137274
c    31.680635
d    41.659808
Name: one, dtype: float64

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
three  54.930986  68.232707  17.215544  70.765401

               a          b          c          d
one    65.471894  19.137274  31.680635  41.659808
two    31.570587  45.575849  37.739644   5.140845
three  54.930986  68.232707  17.215544  70.765401

布尔型索引

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100, index=["one", "two", "three", "four"],
                 columns=["a", "b", "c", "d"])
print(df)
print("")

d1 = df >50  # d1为布尔型索引
print(d1)
print("")

print(df[d1])  # df根据d1 只返回True的值  False的值对应为NaN
print("")

               a          b          c          d
one    91.503673  74.080822  85.274682  80.788609
two    49.670055  42.221393  36.674490  69.272958
three  78.349843  68.090150  22.326223  93.984369
four   79.057146  77.687246  32.304265   0.567816

           a      b      c      d
one     True   True   True   True
two    False  False  False   True
three   True   True  False   True
four    True   True  False  False

               a          b          c          d
one    91.503673  74.080822  85.274682  80.788609
two          NaN        NaN        NaN  69.272958
three  78.349843  68.090150        NaN  93.984369
four   79.057146  77.687246        NaN        NaN

选取某一列作为布尔型索引，返回True所在行的所有列。注意: 按行筛选时只能用单列（布尔型Series）作为索引；选取多列得到的是逐元素的布尔掩码（见下一个例子）

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100, index=["one", "two", "three", "four"],
                 columns=["a", "b", "c", "d"], dtype=np.int64)
print(df)
print("") 

d2 = df["b"] > 50  
print(d2)
print("")

print(df[d2])

        a   b   c   d
one    27  18  47  61
two    26  35  16  78
three  80  98  94  41
four   85   3  47  90

one      False
two      False
three     True
four     False
Name: b, dtype: bool

        a   b   c   d
three  80  98  94  41

选取多列作为布尔型索引，返回True所对应的值，False对应为NaN，没有选取的列全部填充为NaN

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100, index=["one", "two", "three", "four"],
                 columns=["a", "b", "c", "d"], dtype=np.int64)
print(df)
print("")

d3 = df[["a", "c"]] > 50
print(d3)
print("")

print(df[d3])

        a   b   c   d
one    49  82  32  39
two    78   2  24  84
three   6  84  84  69
four   21  89  16  77

           a      c
one    False  False
two     True  False
three  False   True
four   False  False

          a   b     c   d
one     NaN NaN   NaN NaN
two    78.0 NaN   NaN NaN
three   NaN NaN  84.0 NaN
four    NaN NaN   NaN NaN

多重索引（连续使用多次索引: 先取列再取行，或先取行再取列）

print(df)

        a   b   c   d
one    49  82  32  39
two    78   2  24  84
three   6  84  84  69
four   21  89  16  77

print(df["a"].loc[["one", "three"]])  # 取列再取行
print("")

print(df[["a", "c"]].iloc[0:3])

one      49
three     6
Name: a, dtype: int64

        a   c
one    49  32
two    78  24
three   6  84

print(df.loc[["one", "three"]][["a", "c"]])  # 取行再取列

        a   c
one    49  32
three   6  84

print(df > 50)
print("")

print(df[df>50])
print("")

print(df[df>50][["a","b"]])

           a      b      c      d
one    False   True  False  False
two     True  False  False   True
three  False   True   True   True
four   False   True  False   True

          a     b     c     d
one     NaN  82.0   NaN   NaN
two    78.0   NaN   NaN  84.0
three   NaN  84.0  84.0  69.0
four    NaN  89.0   NaN  77.0

          a     b
one     NaN  82.0
two    78.0   NaN
three   NaN  84.0
four    NaN  89.0

DataFrame基本技巧

import numpy as np
import pandas as pd

arr = np.random.rand(16).reshape(8, 2)*10
# print(arr)
print("")

print(len(arr))
print("")

df = pd.DataFrame(arr, index=[chr(i) for i in range(97, 97+len(arr))], columns=["one", "two"])
print(df)

8

        one       two
a  2.129959  1.827002
b  8.631212  0.423903
c  6.262012  3.851107
d  6.890305  9.543065
e  6.883742  3.643955
f  2.740878  6.851490
g  6.242513  7.402237
h  9.226572  3.179664

查看数据

print(df)
print("")

print(df.head(2))  # 查看头部数据 默认查看5条
print("")

print(df.tail(3))  # 查看末尾数据 默认查看5条

        one       two
a  2.129959  1.827002
b  8.631212  0.423903
c  6.262012  3.851107
d  6.890305  9.543065
e  6.883742  3.643955
f  2.740878  6.851490
g  6.242513  7.402237
h  9.226572  3.179664

        one       two
a  2.129959  1.827002
b  8.631212  0.423903

        one       two
f  2.740878  6.851490
g  6.242513  7.402237
h  9.226572  3.179664

转置

print(df)

        one       two
a  2.129959  1.827002
b  8.631212  0.423903
c  6.262012  3.851107
d  6.890305  9.543065
e  6.883742  3.643955
f  2.740878  6.851490
g  6.242513  7.402237
h  9.226572  3.179664

print(df.T)

            a         b         c         d         e         f         g  \
one  2.129959  8.631212  6.262012  6.890305  6.883742  2.740878  6.242513   
two  1.827002  0.423903  3.851107  9.543065  3.643955  6.851490  7.402237   

            h  
one  9.226572  
two  3.179664

添加与修改

df = pd.DataFrame(np.random.rand(16).reshape(4,4),index=["one", "two", "three", "four"], columns=["a", "b", "c", "d"])
print(df)
print("")

df.loc["five"] = 100  # 增加一行
print(df)
print("")

df["e"] = 10  # 增加一列
print(df)
print("")

df["e"] = 101  # 修改一列
print(df)
print("")

df.loc["five"] = 111  # 修改一行
print(df)
print("")

              a         b         c         d
one    0.708481  0.285426  0.355058  0.990070
two    0.199559  0.733047  0.322982  0.791169
three  0.198043  0.801163  0.356082  0.857501
four   0.430182  0.020549  0.896011  0.503088

                a           b           c           d
one      0.708481    0.285426    0.355058    0.990070
two      0.199559    0.733047    0.322982    0.791169
three    0.198043    0.801163    0.356082    0.857501
four     0.430182    0.020549    0.896011    0.503088
five   100.000000  100.000000  100.000000  100.000000

                a           b           c           d   e
one      0.708481    0.285426    0.355058    0.990070  10
two      0.199559    0.733047    0.322982    0.791169  10
three    0.198043    0.801163    0.356082    0.857501  10
four     0.430182    0.020549    0.896011    0.503088  10
five   100.000000  100.000000  100.000000  100.000000  10

                a           b           c           d    e
one      0.708481    0.285426    0.355058    0.990070  101
two      0.199559    0.733047    0.322982    0.791169  101
three    0.198043    0.801163    0.356082    0.857501  101
four     0.430182    0.020549    0.896011    0.503088  101
five   100.000000  100.000000  100.000000  100.000000  101

                a           b           c           d    e
one      0.708481    0.285426    0.355058    0.990070  101
two      0.199559    0.733047    0.322982    0.791169  101
three    0.198043    0.801163    0.356082    0.857501  101
four     0.430182    0.020549    0.896011    0.503088  101
five   111.000000  111.000000  111.000000  111.000000  111

删除 del(删除列)/drop(默认删除行，指定axis=1删除列)

df = pd.DataFrame(np.random.rand(16).reshape(4,4),index=["one", "two", "three", "four"], columns=["a", "b", "c", "d"])
print(df)
print("")

del df["a"]  # 删除列  改变原数组
print(df)

              a         b         c         d
one    0.339979  0.577661  0.108308  0.482164
two    0.374043  0.102067  0.660970  0.786986
three  0.384832  0.076563  0.529472  0.358780
four   0.938592  0.852895  0.466709  0.938307

              b         c         d
one    0.577661  0.108308  0.482164
two    0.102067  0.660970  0.786986
three  0.076563  0.529472  0.358780
four   0.852895  0.466709  0.938307

df = pd.DataFrame(np.random.rand(16).reshape(4,4),index=["one", "two", "three", "four"], columns=["a", "b", "c", "d"])
print(df)
print("")

d1 = df.drop("one")  # 删除行 并返回新的数组 不改变原数组
print(d1)
print("")

print(df)

              a         b         c         d
one    0.205438  0.324132  0.401131  0.368300
two    0.471426  0.671785  0.837956  0.097416
three  0.888816  0.451950  0.137032  0.568844
four   0.524813  0.448306  0.875787  0.479477

              a         b         c         d
two    0.471426  0.671785  0.837956  0.097416
three  0.888816  0.451950  0.137032  0.568844
four   0.524813  0.448306  0.875787  0.479477

              a         b         c         d
one    0.205438  0.324132  0.401131  0.368300
two    0.471426  0.671785  0.837956  0.097416
three  0.888816  0.451950  0.137032  0.568844
four   0.524813  0.448306  0.875787  0.479477

df = pd.DataFrame(np.random.rand(16).reshape(4,4),index=["one", "two", "three", "four"], columns=["a", "b", "c", "d"])
print(df)
print("")

d2 = df.drop("a", axis=1)  # 删除列 返回新的数组 不会改变原数组
print(d2)
print("")

print(df)

              a         b         c         d
one    0.939552  0.613218  0.357056  0.534264
two    0.110583  0.602123  0.990186  0.149132
three  0.756016  0.897848  0.176100  0.204789
four   0.655573  0.819009  0.094322  0.656406

              b         c         d
one    0.613218  0.357056  0.534264
two    0.602123  0.990186  0.149132
three  0.897848  0.176100  0.204789
four   0.819009  0.094322  0.656406

              a         b         c         d
one    0.939552  0.613218  0.357056  0.534264
two    0.110583  0.602123  0.990186  0.149132
three  0.756016  0.897848  0.176100  0.204789
four   0.655573  0.819009  0.094322  0.656406

排序

根据指定列的列值排序，同时列值所在的行也会跟着移动: .sort_values(['列'])

# 单列

df = pd.DataFrame(np.random.rand(16).reshape(4,4), columns=["a", "b", "c", "d"])
print(df)
print("")

print(df.sort_values(['a']))  # 默认升序  
print("")

print(df.sort_values(['a'], ascending=False))  # 降序

          a         b         c         d
0  0.616386  0.416094  0.072445  0.140167
1  0.263227  0.079205  0.520708  0.866316
2  0.665673  0.836688  0.733966  0.310229
3  0.405777  0.090530  0.991211  0.712312

          a         b         c         d
1  0.263227  0.079205  0.520708  0.866316
3  0.405777  0.090530  0.991211  0.712312
0  0.616386  0.416094  0.072445  0.140167
2  0.665673  0.836688  0.733966  0.310229

          a         b         c         d
2  0.665673  0.836688  0.733966  0.310229
0  0.616386  0.416094  0.072445  0.140167
3  0.405777  0.090530  0.991211  0.712312
1  0.263227  0.079205  0.520708  0.866316

根据索引排序 .sort_index()

df = pd.DataFrame(np.random.rand(16).reshape(4,4), index=[2,1,3,0], columns=["a", "b", "c", "d"])
print(df)
print("")

print(df.sort_index())  # 默认升序
print("")

print(df.sort_index(ascending=False))  # 降序

          a         b         c         d
2  0.669311  0.118176  0.635512  0.248388
1  0.752321  0.935779  0.572554  0.274019
3  0.701334  0.354684  0.592998  0.402686
0  0.548317  0.966295  0.191219  0.307908

          a         b         c         d
0  0.548317  0.966295  0.191219  0.307908
1  0.752321  0.935779  0.572554  0.274019
2  0.669311  0.118176  0.635512  0.248388
3  0.701334  0.354684  0.592998  0.402686

          a         b         c         d
3  0.701334  0.354684  0.592998  0.402686
2  0.669311  0.118176  0.635512  0.248388
1  0.752321  0.935779  0.572554  0.274019
0  0.548317  0.966295  0.191219  0.307908

df = pd.DataFrame(np.random.rand(16).reshape(4,4), index=["x", "z", "y", "t"], columns=["a", "b", "c", "d"])
print(df)
print("")

print(df.sort_index())  # 根据字母顺序表排序

          a         b         c         d
x  0.717421  0.206383  0.757656  0.720580
z  0.969988  0.551812  0.210200  0.083031
y  0.956637  0.759216  0.350744  0.335287
t  0.846718  0.207411  0.936231  0.891330

          a         b         c         d
t  0.846718  0.207411  0.936231  0.891330
x  0.717421  0.206383  0.757656  0.720580
y  0.956637  0.759216  0.350744  0.335287
z  0.969988  0.551812  0.210200  0.083031

df = pd.DataFrame(np.random.rand(16).reshape(4,4), index=["three", "one", "four", "two"], columns=["a", "b", "c", "d"])
print(df)
print("")

print(df.sort_index())  # 按字符串的字典序排序(逐字符比较 不只看首字母)

              a         b         c         d
three  0.173818  0.902347  0.106037  0.303450
one    0.591793  0.526785  0.101916  0.884698
four   0.685250  0.364044  0.932338  0.668774
two    0.240763  0.260322  0.722891  0.634825

              a         b         c         d
four   0.685250  0.364044  0.932338  0.668774
one    0.591793  0.526785  0.101916  0.884698
three  0.173818  0.902347  0.106037  0.303450
two    0.240763  0.260322  0.722891  0.634825

CDA数据分析师考试相关入口一览（建议收藏）：

▷ 想报名CDA认证考试，点击>>> “CDA报名” 了解CDA考试详情；