Pandas 字符串处理_CDA答疑社区

向量化字符串操作基础

计算长度
import pandas as pd
import numpy as np
# Sample data: all-lowercase, all-uppercase, a multi-word sentence, mixed case.
s = pd.Series(
    [
        "lower",
        "CAPITALS",
        "this is a sentence",
        "SwApCaSe",
    ]
)

# Number of characters in each element.
s.str.len()

# Every character converted to lowercase.
s.str.lower()

# Every character converted to uppercase.
s.str.upper()

# Title case: first letter of each word uppercase, the remaining letters lowercase.
s.str.title()

# Only the first character of the whole string uppercase, everything else lowercase.
s.str.capitalize()

# Case inverted: uppercase becomes lowercase and lowercase becomes uppercase.
s.str.swapcase()

字符检索

# Underscore-separated samples, including one missing value (NaN).
s = pd.Series(["a_b_c", "c_ed_e", np.nan, "f_g_h"])

# get() returns the element at the given position of each string (here
# position 2); it fetches a single position only.
s.str.get(i=2)

图片.png

s.str.get(-1)  # get() with a negative position counts from the end of each string

find和index

s.str.find('e')  # find() searches left to right and returns the index of the match, or -1 when not found

图片.png

# index() also returns the position of the substring, but raises an error
# instead of returning -1 when the substring is not found.
s.str.index('')  # NOTE(review): an empty substring presumably always matches at position 0 — confirm this is the intended example

s.str.index('_')  # position of the first underscore in each string

字符的提取 findall

# Samples mixing letters, digits and underscores, plus one missing value.
s = pd.Series(['a47835798', 'c_e46765d_e', np.nan, 'f_g5785_h'])

# findall() returns, for each element, a list of every regex match.
# Inside a character class a comma is a literal character, not a separator,
# so the class is written "[abcd]"; "[a,b,c,d]" would also match ",".
s.str.findall("[abcd]")

图片.png

s.str.findall("[0-9]")  # extract every digit; each match becomes one list item

图片.png

# extract() pulls out the text captured by the regex; the part to extract
# must be wrapped in parentheses (a capture group).
s.str.extract("([a-d])")  # only the first match in each string is returned

# match() tests whether each string STARTS with a match of the pattern
# (re.match semantics); it does not require the whole string to match.
s.str.match("[a-d]")

字符类型判断

# Samples covering lowercase, uppercase, mixed case, digits, a lone space
# and an alphanumeric mix.
s = pd.Series(
    ["lower", "CAPITALS", "this is a sentence", "SwApCaSe", "21", " ", "12a"]
)

# True where every cased character is lowercase (and at least one exists).
s.str.islower()

# True where every cased character is uppercase (and at least one exists).
s.str.isupper()

# True where the string consists solely of whitespace.
s.str.isspace()

字符判断

import numpy as np
# Three short names plus one missing value.
s = pd.Series(["bat", "Bear", "cat", np.nan])

# startswith() is case-sensitive, so "Bear" does not count as starting with "b".
s.str.startswith(pat="b")

图片.png

s.str.startswith('b', na=False)  # na=False reports missing values as False

图片.png

s.str.endswith('t', na=False)  # True where the string ends with 't'; missing values reported as False

图片.png

s.str.contains('t')  # contains() tests whether the pattern occurs anywhere in each string

图片.png

字符对齐与填充

import numpy as np
# Three short names plus one missing value.
s = pd.Series(["bat", "Bear", "cat", np.nan])

# ljust() left-aligns each string, filling on the right with "*" up to width 10.
s.str.ljust(width=10, fillchar="*")

图片.png

s.str.center(10,"*")  # center() pads both sides with "*" up to a total width of 10

图片.png

# Three short names plus one missing value.
s = pd.Series(["bat", "Bear", "cat", np.nan])

# pad() with side="left" adds the fill character on the left up to width 10.
s.str.pad(width=10, fillchar="*", side="left")

图片.png

s.str.pad(10, side="right", fillchar="*")  # side="right" adds the fill character on the right

图片.png

s.str.zfill(10)  # zfill() pads on the left with "0" up to a width of 10

图片.png

字符整理

# Strings with stray spaces and asterisks around the actual text.
s = pd.Series([" bat ", " Bear * ", "cat * ", np.nan])
s.values

# Approach 1: chain strip() calls, peeling one layer at a time
# (outer whitespace, then "*", then the whitespace that was uncovered).
(
    s.str.strip()
    .str.strip("*")
    .str.strip()
    .values
)

# Approach 2: one strip() given every unwanted character; any of them is
# removed from both ends until a different character is reached.
s.str.strip(to_strip="* ").values

替换

pd.Series(['foo', 'fuz', np.nan]).str.replace('f', 'b')  # replace every 'f' with 'b'

# regex=True: the pattern is a regular expression, so 'f.' means an 'f'
# followed by any single character.
pd.Series(['f1o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)

# regex=False: the pattern is literal text, so only an 'f' followed by an
# actual '.' is replaced.
pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)

字符分割

# Underscore-separated samples, including one missing value.
s = pd.Series(['a_b_c_d', 'c_d_e', np.nan, 'f_g_h'])

# split() cuts each string at every occurrence of the separator and returns
# a list of pieces per element.
s.str.split('_')

# n limits the number of splits (at most 2 here, giving at most 3 pieces).
# It is passed by keyword because pandas 2.x accepts only `pat` positionally
# and raises TypeError for a positional `n`.
s.str.split('_', n=2)

# slice() takes the characters from position 1 up to, but not including,
# position 3 (half-open interval).
s.str.slice(1, 3)

# slice_replace() replaces that same positional slice with the given string.
s.str.slice_replace(1, 3, "*")

图片.png

# Two full names containing both spaces and hyphens.
s = pd.Series(["Linda van-der Berg", "George Pitt-Rivers"])

# partition() splits each string at the first occurrence of the separator
# (a single space here) and returns a DataFrame with exactly three parts:
# the text before the separator, the separator itself, and the text after it.
s.str.partition(sep=" ")

图片.png

s.str.partition('-')  # split at the first '-' instead of the default separator

图片.png

拼接

# Single-letter samples with one missing value.
s = pd.Series(["a", "b", np.nan, "d"])

# With no other series supplied, cat() joins the elements of the series
# itself into one string, using the separator between them.
s.str.cat(sep=" ")

图片.png

# Two equally long series; s1 contains one missing value.
s1 = pd.Series(["a", "b", np.nan, "d"])
s2 = pd.Series(["A", "B", "C", "D"])

# cat() concatenates s1 and s2 element by element with "," in between.
s1.str.cat(others=s2, sep=",")

图片.png

s1.str.cat(s2, sep=',',na_rep='-')  # na_rep='-' substitutes '-' for missing values before concatenating

图片.png

# A series with an explicit, unordered index, used to demonstrate index alignment.
t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
s.str.cat(t, join='left', na_rep='-')  # join='left' aligns t to the calling series' index

图片.png

s.str.cat(t, join='inner', na_rep='-')  # join='inner' keeps only the index labels present in both series

图片.png

s.str.join('-')  # join() inserts the separator between the items of each element (between characters for a plain string); items must be strings

重复

# Three single-letter strings.
s = pd.Series(["a", "b", "c"])

# repeat() duplicates each string the given number of times.
s.str.repeat(2)

图片.png

s.str.repeat(repeats=[1, 2, 3])  # a list gives a separate repeat count for each element

图片.png

统计

# Samples with differing numbers of the lowercase letter "a".
s = pd.Series(
    ["lower", "CAPITALS", "this is a sentence", "SwApCaaSe", "21", " ", "12a"]
)

# count() returns how many times the pattern occurs in each string
# (case-sensitive, so "A" is not counted).
s.str.count(pat="a")