python 正则表达式使用总结

news2025/7/12 11:10:53

re库使用

re.findall

import re

# Run the same literal pattern against two inputs:
#   1) a string that contains "ai" several times -> list with every hit
#   2) a string without "ai"                      -> empty list
for txt in ("ai aiThe rain in Spain", "adafda dafasdf"):
    x = re.findall("ai", txt)
    print(x)

在这里插入图片描述

# For str patterns \w is Unicode-aware by default, so the Chinese characters are
# returned one by one alongside the ASCII letters; the trailing "\n" is not.
s = '中国人adfadsfasfasdfsdaf中国万岁\n'
word_chars = re.findall(r"\w", s)
print(word_chars)

在这里插入图片描述

import re
# re.match only tries the pattern at the very start of the string.
url = 'www.runoob.com'
print(re.match('www', url).span())  # pattern is at position 0 -> (0, 3)
print(re.match('com', url))         # pattern is not at position 0 -> None

在这里插入图片描述

re.search

import re

txt = "The rain in Spain"
# Use a raw string: "\s" in a normal literal is an invalid escape sequence
# (DeprecationWarning since 3.6, SyntaxWarning on Python 3.12+).
x = re.search(r"\s", txt)

# re.search returns the first match anywhere in the string (or None if absent).
print("The first white-space character is located in position:", x.start())

在这里插入图片描述

re.split(对元素进行划分）

import re
txt = "The rain in Spain"
# Raw string avoids the invalid-escape warning for "\s".
# Split at every whitespace character.
x = re.split(r"\s", txt)
print(x)
import re
txt = "The rain in Spain"
# Split only at the first whitespace. maxsplit is passed by keyword because
# passing it positionally is deprecated since Python 3.13; the raw string
# avoids the invalid-escape warning for "\s".
x = re.split(r"\s", txt, maxsplit=1)
print(x)

在这里插入图片描述

re.sub()

import re

txt = "The rain in Spain"
# Replace every whitespace character with "9" (raw string avoids the
# invalid-escape warning for "\s").
x = re.sub(r"\s", "9", txt)
print(x)

import re

txt = "The rain in Spain"
# Replace only the first two whitespace characters. count is passed by keyword
# because passing it positionally is deprecated since Python 3.13.
x = re.sub(r"\s", "9", txt, count=2)
print(x)

在这里插入图片描述

pandas库正则表达式

pandas.str.match(元素匹配）

example1

import numpy as np
import pandas as pd
# str.match anchors the pattern at the start of each element, so this flags
# the labels beginning with A0, A1 or A2.
a = np.array(['A0', 'A1', 'A2', 'A3', 'A4', 'B0', 'B1', 'C0'])
labels = pd.Series(a)
labels.str.match(r'A[0-2]')

在这里插入图片描述

example2

 s = pd.Series(['zzzz', 'zzzd', 'zzdd', 'zddd', 'dddn', 'ddnz', 'dnzn', 'nznz',
'znzn', 'nznd', 'zndd', 'nddd', 'ddnn', 'dnnn', 'nnnz', 'nnzn', 'nznn', 'znnn',
'nnnn', 'nnnd', 'nndd', 'dddz', 'ddzn', 'dznn',  'znnz', 'nnzz', 'nzzz', 'zzzn',
                'zznn', 'dddd', 'dnnd'])

#print(s.str.endswith("dd"))
#print("*"*50)
#print(s[s.str.endswith("dd")])
#print("*"*50)
print("*"*50)
print(s.str.match(".*dd$"))
print(s[s.str.match(".*dd$")])

在这里插入图片描述

pandas.str.extract

注意正则表达式里的括号里的内容就是最终返回匹配的内容

example1

import pandas as pd 

ele = [
    "Toy Story (1995)",
    "GoldenEye (1995)",
    "Four Rooms (1995)",
    "Get Shorty (1995)",
    "Copycat (1995)",
]
df = pd.DataFrame({"movie_title": ele})
print(df)

# Capture everything before " (" -- i.e. the title without the year.
# Raw string: "\(" in a normal literal is an invalid escape sequence.
# expand=False returns a Series, which is what a single-column assignment expects.
df['just_movie_titles'] = df['movie_title'].str.extract(r'(.+?) \(', expand=False)
df

在这里插入图片描述

example 2

import pandas as pd
df = pd.DataFrame({"col1": ["1/1/100 'BA1", "1/1/102Packe", "1/1/102 'to_"]})
# Pull the leading "d/d/d" token out of each value.
# Raw string: "\d" in a normal literal is an invalid escape sequence.
df["col2"] = df['col1'].str.extract(r'(\d+/\d+/\d+)', expand=True)
df

结果如下
在这里插入图片描述

example3

# importing pandas as pd 
import pandas as pd 
  
# importing re for regular expressions 
import re 
  
# Build the city Series with its 'City N' labels supplied at construction time.
idx = ['City 1', 'City 2', 'City 3', 'City 4', 'City 5']
sr = pd.Series(['New_York', 'Lisbon', 'Tokyo', 'Paris', 'Munich'], index=idx)
print(sr)

# Capture the first vowel together with the character right after it.
vowel_pair = '([aeiou].)'
result = sr.str.extract(pat=vowel_pair)
print(result)

在这里插入图片描述

example4

import pandas as pd 
s = pd.Series(['a1', 'b2', 'c3'])
# Two capture groups -> two result columns; 'c3' does not start with a or b,
# so its row is NaN in both columns.
letter_and_digit = s.str.extract(r'([ab])(\d)')
letter_and_digit

在这里插入图片描述
设置expand = True

# Only the digit is inside the parentheses, so only the digit is returned;
# [ab] must still precede it, so 'c3' gives NaN. expand=True asks for a
# DataFrame result.
s.str.extract(r'[ab](\d)', expand=True)

在这里插入图片描述

设置新的列名

# Named groups (?P<name>...) become the column names of the result:
# 'letter' and 'digit'.
s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')

在这里插入图片描述

# Capture the first digit of every element (no letter constraint here).
s.str.extract(r'(\d)')

在这里插入图片描述

# Capture the first 'a' or 'b'; elements containing neither give NaN.
s.str.extract(r'([ab])')

在这里插入图片描述

pandas.str.split

example1

import pandas as pd
# Each ticker is three space-separated fields.
temp = pd.DataFrame({
    'ticker': ['spx 5/25/2001 p500', 'spx 5/25/2001 p600', 'spx 5/25/2001 p700'],
})
# str.split yields a Series of lists; .str[-1] then picks the last field of each list.
temp2 = temp['ticker'].str.split(' ')
print(temp2)
temp2.str[-1]

在这里插入图片描述
抽取某一列的部分元素设置为新列

import pandas as pd
df = pd.DataFrame({
    'gene': ["1 // foo // blabla",
             "2 // bar // lalala",
             "3 // qux // trilil",
             "4 // woz // hohoho"],
    'cell1': [5, 9, 1, 7],
    'cell2': [12, 90, 13, 87],
})

print(df)

# Keep the middle field. Splitting on '//' leaves the padding spaces around it
# (' foo '), so strip them to get the bare name.
df['gene'] = df['gene'].str.split('//').str[1].str.strip()
df

结果如下
在这里插入图片描述

可以使用 pandas.str.extract 达到与 pandas.str.split 相同的结果

import pandas as pd
# Sample frame: the gene column packs three fields separated by '//'.
gene_labels = [
    "1 // foo // blabla",
    "2 // bar // lalala",
    "3 // qux // trilil",
    "4 // woz // hohoho",
]
df = pd.DataFrame({'gene': gene_labels, 'cell1': [5, 9, 1, 7], 'cell2': [12, 90, 13, 87]})

print(df)

# Capture the run of lowercase letters/spaces sitting between the two '//' separators.
middle_field = df["gene"].str.extract(r"\/\/([a-z ]+)\/\/")
df["gene"] = middle_field
print(df)
# The capture still carries its padding spaces; trim them.
df["gene"] = df["gene"].str.strip()
df

结果如下
在这里插入图片描述

example2

import pandas as pd
df = pd.DataFrame({'Scenario': ['HI', 'HI', 'HI', 'HI', 'HI', 'HI'],
                   'Savings': ['Total_FFC_base0', 'Total_FFC_savings1', 'Total_FFC_saving2',
                               'Total_FFC_savings3', 'Total_site_base0', 'Total_site_savings1'],
                   'PC1': [0.12, 0.15, 0.12, 0.17, 0.12, 0.15],
                   'PC2': [0.13, 0.12, 0.14, 0.15, 0.15, 0.15]})
print(df)

# Split each label into its middle field (-> 'Savings') and trailing digit (-> 'EL').
# Raw string: "\d" in a normal literal is an invalid escape sequence.
# NOTE: the greedy ".*" before the digit group leaves only the last digit for
# 'EL'; that is fine for the single-digit suffixes used here.
df[['Savings', 'EL']] = df['Savings'].str.extract(r'_(?P<Savings>.*)_.*(?P<EL>\d+)')
df

在这里插入图片描述

import pandas as pd
# Same sample frame as before, rebuilt so this cell can run on its own.
savings_labels = [
    'Total_FFC_base0', 'Total_FFC_savings1', 'Total_FFC_saving2',
    'Total_FFC_savings3', 'Total_site_base0', 'Total_site_savings1',
]
df = pd.DataFrame({
    'Scenario': ['HI'] * 6,
    'Savings': savings_labels,
    'PC1': [0.12, 0.15, 0.12, 0.17, 0.12, 0.15],
    'PC2': [0.13, 0.12, 0.14, 0.15, 0.15, 0.15],
})
print(df)

# Three capture groups separated by underscores -> three result columns.
df['Savings'].str.extract('(.*)_(.*)_(.*)')

在这里插入图片描述

# The trailing \d sits outside the last group, so the final digit is matched
# but left out of the third column.
# Raw string: "\d" in a normal literal is an invalid escape sequence.
df['Savings'].str.extract(r'(.*)_(.*)_(.*)\d')

在这里插入图片描述

# A single group around ".*" captures each value from its start up to the
# first line break (the whole value for single-line strings).
df['Savings'].str.extract('(.*)')

在这里插入图片描述

df['Savings'].str.extract(r'(\d+)')
# Only what is inside the parentheses is returned; anything outside them just
# anchors the match and does not appear in the result.

在这里插入图片描述

实例操作1

import numpy as np 
import pandas as pd 

# Sample labels used by the tests below. Every label has the shape
# <prefix>_P<number>_S<number>, where the prefix is one of
# CD1C, CD141, pDC or DoubleNeg.
ele = np.array(['CD1C_P14_S91', 'CD1C_P14_S96', 'CD1C_P3_S12', 'CD141_P7_S22',
       'CD141_P7_S24', 'CD1C_P4_S36', 'CD141_P7_S7', 'CD141_P8_S27',
       'CD141_P8_S31', 'CD141_P9_S72', 'pDC_P10_S73', 'pDC_P10_S74',
       'pDC_P10_S83', 'pDC_P13_S56', 'pDC_P13_S59', 'pDC_P13_S70',
       'pDC_P14_S76', 'pDC_P14_S78', 'pDC_P14_S87', 'pDC_P14_S89',
       'pDC_P14_S90', 'pDC_P14_S91', 'pDC_P14_S92', 'pDC_P3_S14',
       'pDC_P3_S16', 'pDC_P3_S17', 'pDC_P3_S18', 'pDC_P3_S1',
       'pDC_P3_S21', 'pDC_P3_S2', 'pDC_P3_S4', 'pDC_P3_S5', 'pDC_P4_S28',
       'pDC_P4_S29', 'pDC_P4_S30', 'pDC_P4_S36', 'pDC_P4_S37',
       'pDC_P4_S40', 'pDC_P4_S42', 'pDC_P4_S43', 'pDC_P4_S45',
       'pDC_P4_S46', 'pDC_P4_S48', 'pDC_P7_S15', 'pDC_P7_S16',
       'pDC_P7_S17', 'pDC_P7_S1', 'pDC_P7_S21', 'pDC_P7_S22', 'pDC_P7_S3',
       'pDC_P7_S7', 'pDC_P8_S26', 'pDC_P8_S28', 'pDC_P8_S32',
       'pDC_P8_S34', 'pDC_P8_S39', 'pDC_P8_S40', 'pDC_P8_S42',
       'pDC_P8_S44', 'pDC_P8_S46', 'pDC_P8_S47', 'pDC_P9_S52',
       'pDC_P9_S54', 'pDC_P9_S61', 'pDC_P9_S63', 'pDC_P9_S65',
       'pDC_P9_S71', 'DoubleNeg_P10_S73', 'DoubleNeg_P10_S76',
       'DoubleNeg_P10_S79', 'DoubleNeg_P10_S80', 'DoubleNeg_P10_S81',
       'DoubleNeg_P10_S84', 'DoubleNeg_P10_S86', 'DoubleNeg_P13_S49',
       'DoubleNeg_P13_S53', 'DoubleNeg_P13_S64', 'DoubleNeg_P13_S67',
       'DoubleNeg_P14_S74', 'DoubleNeg_P14_S78', 'DoubleNeg_P14_S81',
       'DoubleNeg_P14_S82', 'DoubleNeg_P14_S83', 'DoubleNeg_P14_S87',
       'DoubleNeg_P14_S90', 'DoubleNeg_P14_S92', 'DoubleNeg_P14_S95',
       'DoubleNeg_P3_S1', 'DoubleNeg_P3_S20', 'DoubleNeg_P3_S24',
       'DoubleNeg_P3_S3', 'DoubleNeg_P3_S5', 'DoubleNeg_P3_S7',
       'DoubleNeg_P4_S29', 'DoubleNeg_P4_S30', 'DoubleNeg_P4_S35',
       'DoubleNeg_P4_S39', 'DoubleNeg_P4_S42', 'DoubleNeg_P4_S45',
       'DoubleNeg_P4_S46', 'DoubleNeg_P7_S11', 'DoubleNeg_P7_S13',
       'DoubleNeg_P7_S14', 'DoubleNeg_P7_S16', 'DoubleNeg_P7_S24',
       'DoubleNeg_P7_S2', 'DoubleNeg_P7_S3', 'DoubleNeg_P7_S5',
       'DoubleNeg_P7_S7', 'DoubleNeg_P7_S8', 'DoubleNeg_P8_S25',
       'DoubleNeg_P8_S30', 'DoubleNeg_P8_S38', 'DoubleNeg_P8_S41',
       'DoubleNeg_P8_S42', 'DoubleNeg_P8_S43', 'DoubleNeg_P8_S44',
       'DoubleNeg_P9_S64', 'DoubleNeg_P9_S66', 'CD1C_P13_S57',
       'CD1C_P13_S63', 'CD1C_P14_S85'])
               
# One-column frame ("cell") holding the labels.
df = pd.DataFrame({"cell":ele})
df

在这里插入图片描述

测试1(仅仅抽取大写字母）

# First run of uppercase letters in each label: 'CD' for CD1C/CD141 labels,
# 'DC' for pDC labels (the lowercase 'p' is skipped), 'D' for DoubleNeg labels.
df["cell"].str.extract(r"([A-Z]+)")

在这里插入图片描述

测试2(抽取大写字母和小写字母)

# First run of letters of either case: 'CD', 'pDC' or 'DoubleNeg'
# (the run stops at the first digit or underscore).
df["cell"].str.extract(r"([A-Za-z]+)")

在这里插入图片描述

测试3(联合使用）

# Letters, then digits, then letters again, all adjacent.
# Only the CD1C_* labels match (giving 'CD1C'). CD141_*, pDC_* and DoubleNeg_*
# labels contain no letter-digit-letter run, so those rows are NaN.
df["cell"].str.extract(r"([A-Za-z]+\d+[A-Za-z]+)")

在这里插入图片描述

测试4（使用split）

# Take everything before the first underscore, then count each prefix.
prefix = df["cell"].str.split("_").str[0]
print(prefix)
print(prefix.value_counts())

在这里插入图片描述

测试5（使用正则表达式）

# [a-zA-Z0-9] 判断字母和数字
print(df["cell"].str.extract(r"([a-zA-Z0-9]+)"))
print(df["cell"].str.extract(r"([a-zA-Z0-9]+)").value_counts())

在这里插入图片描述

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.coloradmin.cn/o/561530.html

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈，一经查实，立即删除！

python 正则表达式使用总结

re库使用

re.findall

re.search

re.split(对元素进行划分）

re.sub()

pandas库正则表达式

pandas.str.match(元素匹配）

example1

example2

pandas.str.extract

example1

example 2

example3

example4

pandas.str.split

example1

example2

实例操作1

测试1(仅仅抽取大写字母）

测试2(抽取大写字母和小写字母)

测试3(联合使用）

测试4（使用split）

测试5（使用正则表达式）

相关文章