文本清洗正则表达式（持续更新）

发布时间：2025-01-04 03:26

持续更新和修正自己的文化观念 #生活技巧# #自我提升技巧# #跨文化交际能力#

文章目录

- 常用但记不住的pattern
  - 正向肯定预查
  - \w
- 特殊字符清洗
- 清除连续空白符
- HTML标签清洗
- 标点格式化
- 文本切分
- 是否单词
- 时间&日期
- 网址、邮箱、电话…
- 数量词
- 数学
  - 小数、百分比
  - 中文数字+阿拉伯数字
  - 数词
  - 货币
  - 带逗号数字
- 物理

常用但记不住的pattern

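| pattern | description |
| --- | --- |
| `[\u4e00-\u9fa5]` | 中文 |
| `\s` | 任何空白字符 |
| `\S` | 任何非空白字符 |
| `(?=pattern)` | 正向肯定预查 |
| `(?<=pattern)` | 反向肯定预查 |
| `(?!pattern)` | 正向否定预查 |
| `(?<!pattern)` | 反向否定预查 |
| `\w` | 中英文数字下划线 |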

正向肯定预查

import re

# Positive lookahead: '小米' matches only when '手机' follows immediately.
# The lookahead itself consumes no characters.
rc = re.compile('小米(?=手机)')
print(rc.fullmatch('小米手机'))  # None: '手机' is not consumed, so the full string is not matched
print(rc.fullmatch('小米'))  # None: nothing follows for the lookahead to accept
print(rc.findall('小米手机'))  # ['小米']
print(rc.findall('小米粥'))  # []

\w

import re

# Raw string: '\w' in a normal literal is an invalid escape sequence.
rec = re.compile(r'\w')
a = 'aA1啊の.\n,。_+-='
# With str patterns \w is Unicode-aware, so Japanese kana match as well.
print(rec.findall(a))
# ['a', 'A', '1', '啊', 'の', '_']
print(list(rec.sub('', a)))
# ['.', '\n', ',', '。', '+', '-', '=']

特殊字符清洗

import re

# Sample text mixing CJK, ASCII, full-width and punctuation characters.
a = '''山aA1１,./<>?";':[]{}\\|`~!@#$%^&*()_+-=《》？，。/：；’‘”“【】、·！@=#=￥%=…（）—❤'''

# Whitelist written out by hand. It lacks the apostrophe and the backslash,
# so both are reported as "special" together with '❤'.
rc = re.compile(r'[^-_a-zA-Z\d\u4e00-\u9fa5\s,<.>/?;:"\[{\]}|`~!@#$%^&*()=+，《。》？；：‘’“”【】、·！￥…（）—]')
print(rc.findall(a))

# Same whitelist built on \w, with the apostrophe and backslash added.
rc = re.compile(r'[^-\w\s,<.>/?;:\'"\[{\]}\\|`~!@#$%^&*()=+，《。》？；：‘’“”【】、·！￥…（）—]')
print(rc.findall(a))
# ['❤']

清除连续空白符

def replace_continuous_blank_lines(text):
    """Collapse runs of blank or whitespace-only lines into one newline.

    The text is stripped first, so leading and trailing blank lines vanish.
    """
    return re.sub(r'\n\s*\n', '\n', text.strip())


def replace_space(text):
    """Normalise whitespace while keeping line breaks.

    Steps, in order:
    1. strip the text and trim whitespace around every line break
       (this also removes blank lines);
    2. collapse every run of non-newline whitespace into one ASCII space;
    3. drop a space that separates a CJK ideograph (U+4E00-U+9FA5) from a
       non-CJK neighbour. A space between two ideographs, or between two
       non-CJK characters, is kept.
    """
    text = re.sub(r'\s*\n\s*', '\n', text.strip())
    # '+' so that a run such as two tabs becomes one space; without it every
    # whitespace character was replaced one-for-one and runs survived.
    text = re.sub(r'[^\S\n]+', ' ', text)
    text = re.sub('(?<![\u4e00-\u9fa5]) (?=[\u4e00-\u9fa5])|(?<=[\u4e00-\u9fa5]) (?![\u4e00-\u9fa5])', '', text)
    return text


def replace_space_resolutely(text, substitution=''):
    """Replace every whitespace run, newlines included, with *substitution*."""
    return re.sub(r'\s+', substitution, text.strip())

HTML标签清洗

def _tag(name):
    """Pattern for the bare opening tag, the opening tag with attributes and the closing tag."""
    return '<{0}>|<{0} [^>]*>|</{0}>'.format(name)


# Ordered (pattern, replacement) rules. Whole elements whose content must
# disappear (head, style, script, ...) are removed before individual tags.
_TAG_RULES = (
    # standalone elements
    ('<img[^>]*>', ''),  # images
    ('<br/?>|<br [^<>]*>|<hr/?>|<hr [^<>]*>', '\n'),  # line breaks, horizontal rules
    ('&(nbsp|e[mn]sp|thinsp|zwn?j|#13);', ' '),  # whitespace-like entities
    (r'\\xa0|\\u3000', ' '),  # the literal escape texts "\xa0" and "\u3000"
    (r'<!--[\s\S]*?-->', ''),  # comments
    (r'<head>[\s\S]*?</head>', ''),
    (r'<meta[^<>]*>[\s\S]*?</meta>', ''),  # metadata
    (r'<style[^<>]*>[\s\S]*?</style>', ''),  # style sheets
    (r'<script[^<>]*>[\s\S]*?</script>', ''),  # JavaScript
    (r'<s>[\s\S]*?</s>', ''),  # strike-through: the content is dropped too
    ('<input>|<input [^>]*>', ''),  # form inputs
    # inline elements: the tag is removed, the content stays
    (_tag('u'), ''),  # underline
    (_tag('i'), ''),  # italic
    (_tag('b'), ''),  # bold
    (_tag('em'), ''),  # emphasis
    (_tag('strong'), ''),  # strong
    (_tag('mark'), ''),  # highlight
    (_tag('font'), ''),  # font
    (_tag('a'), ''),  # hyperlink
    (_tag('span'), ''),  # span
    # block elements: each tag becomes a line break
    (_tag('p'), '\n'),  # paragraph
    ('<h[1-6][^>]*>|</h[1-6]>', '\n'),  # headings
    (_tag('li'), '\n'),  # list item
    (_tag('ol'), '\n'),  # ordered list
    (_tag('ul'), '\n'),  # unordered list
    (_tag('pre'), '\n'),  # preformatted text
    (_tag('div'), '\n'),  # division
    ('<section[^>]*>|</section>', '\n'),  # section
    (_tag('form'), '\n'),  # form
    (_tag('o:p'), '\n'),  # Microsoft Word paragraph
    # tables: a cell is padded with spaces, other table parts end with a newline
    (r'<td[^>]*>([\s\S]*?)</td>', r' \1 '),
    (r'<tr[^>]*>([\s\S]*?)</tr>', '\\1\n'),
    (r'<th[^>]*>([\s\S]*?)</th>', '\\1\n'),
    (r'<tbody[^>]*>([\s\S]*?)</tbody>', '\\1\n'),
    (r'<table[^>]*>([\s\S]*?)</table>', '\\1\n'),
)

# Applied only when ``completely`` is True.
_REMAINING_TAG_RULES = (
    (r'<canvas[^<>]*>[\s\S]*?</canvas>', ''),  # canvas
    (r'<iframe[^<>]*>[\s\S]*?</iframe>', ''),  # inline frame
    # Any leftover <...> with no CJK inside, except the listed font names.
    ('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', ''),
)


def replace_tag(html, completely=True):
    """Strip HTML markup from *html* and return whitespace-normalised text.

    This is a regex-based text extractor, not an HTML sanitiser.

    :param html: HTML source text.
    :param completely: when exactly ``True``, also drop canvas/iframe
        elements and any remaining tag-like ``<...>`` span.
    :return: the text after tag removal, entity decoding and
        ``replace_space`` normalisation.
    """
    for pattern, replacement in _TAG_RULES:
        html = re.sub(pattern, replacement, html)
    if completely is True:
        for pattern, replacement in _REMAINING_TAG_RULES:
            html = re.sub(pattern, replacement, html)
    # Decode the basic entities. '&amp;' goes last so that text such as
    # '&amp;lt;' decodes to '&lt;' and is not decoded twice.
    html = (html.replace('&lt;', '<').replace('&gt;', '>')
            .replace('&quot;', '"').replace('&amp;', '&'))
    return replace_space(html)

标点格式化

# Translation table: full-width digits (U+FF10-U+FF19) to ASCII digits.
_FULLWIDTH_DIGITS = str.maketrans('０１２３４５６７８９', '0123456789')


def digit_cn2en(text):
    """Return *text* with full-width digits replaced by ASCII digits."""
    return text.translate(_FULLWIDTH_DIGITS)


def replace_punctuation(text):
    """Normalise punctuation, mostly from ASCII to Chinese forms.

    The text is stripped, round brackets become full-width, empty bracket or
    quote pairs are removed, runs of ';', '!' and '?' collapse to one
    full-width mark, dot/comma runs of three or more become one ellipsis, a
    ',' or '.' between two CJK ideographs becomes '，' / '。', and full-width
    digits become ASCII digits.
    """
    text = text.strip().replace('(', '（').replace(')', '）')  # round brackets
    text = re.sub(r'\[\]|【】|（）|{}|<>|“”|‘’', '', text)  # empty bracket/quote pairs
    text = re.sub('[;；]+', '；', text)  # semicolons
    text = re.sub('[!！]+', '！', text)  # exclamation marks
    text = re.sub('[?？]+', '？', text)  # question marks
    text = re.sub('[.]{3,}|,{3,}|。{3,}|，{3,}|…+', '…', text)  # ellipsis
    text = re.sub('(?<=[\u4e00-\u9fa5]),(?=[\u4e00-\u9fa5])', '，', text)  # comma
    text = re.sub('(?<=[\u4e00-\u9fa5])[.](?=[\u4e00-\u9fa5])', '。', text)  # full stop
    return digit_cn2en(text)

文本切分

# Splitters ordered from coarse to fine. Each name is the bound ``split``
# method of a compiled pattern, so ``SEP10(text)`` returns a list of pieces.

# Sentence ends: newline, Chinese full stop, ellipsis, semicolons, and an
# ASCII '.' that sits between two CJK ideographs.
SEP10 = re.compile('[\n。…；;]+|(?<=[\u4e00-\u9fa5])[.]+(?=[\u4e00-\u9fa5])').split
# As SEP10, plus '!' and '?', and a '.' between letters or ideographs.
SEP15 = re.compile('[\n。…；;!！?？]+|(?<=[a-z\u4e00-\u9fa5])[.]+(?=[a-z\u4e00-\u9fa5])', re.I).split
# Exclamation and question marks only.
SEP20 = re.compile('[!！?？]+').split
# Commas and colons.
SEP30 = re.compile('[,，:：]+').split
# Anything that is not \w (letters, digits, underscore; CJK included).
SEP40 = re.compile(r'\W+').split
# Anything that is not an ASCII letter or a CJK ideograph.
SEP45 = re.compile('[^a-zA-Z\u4e00-\u9fa5]+').split
# Anything that is not a CJK ideograph.
SEP50 = re.compile('[^\u4e00-\u9fa5]+').split

# Disabled draft: split on list and chapter numbering at the start of a line.
# sep_cn = re.compile(
#     '^ ?[0-9]{1,2}、|'
#     '^ ?[0-9]{1,2}[.](?=[^0-9])|'
#     '^ ?[一二三四五六七八九十]{1,3}、|'
#     '^ ?（[一二三四五六七八九十]{1,3}）|'
#     '^ ?第[一二三四五六七八九十]{1,3}章').split

是否单词

# A "word" is a run of \w characters that contains at least one ASCII letter
# or CJK ideograph, so pure digit/underscore runs do not qualify.
re_word = re.compile(r'\w*[a-z\u4e00-\u9fa5]\w*', re.I)


def is_word(word, min_len=2):
    """Return *word* if it is a word of at least *min_len* characters, else None.

    :param word: candidate string.
    :param min_len: minimum accepted length in characters.
    """
    if re_word.fullmatch(word) and len(word) >= min_len:
        return word
    return None

时间&日期

时间

# Time quantities such as '3小时20分' or '2年3月'.
# Multi-character units come before the single-character class; otherwise
# '3分钟' stops at '3分' because regex alternation takes the first alternative
# that succeeds.
re_time = re.compile(r'\d+(小时|分钟|毫秒|时辰|[天年月日时分秒°])[\d天年月日时分秒]*')

日期

# Loose date shapes: digits are not range-checked.
re_date_cn = re.compile(r'[12]\d{3} ?年 ?[01]?\d ?月 ?[0-3]?\d ?日')  # yyyy年mm月dd日
re_date_sep = re.compile(r'[12]\d{3}-[01]?\d-[0-3]?\d|[12]\d{3}/[01]?\d/[0-3]?\d')  # yyyy-mm-dd yyyy/mm/dd

# Stricter dates: years 1900-2099, months 1-12, days 1-31.
# Two-digit alternatives are tried first. With the one-digit form first,
# searching '2024-12-25' stopped at '2024-12-2' because nothing follows the
# day in the '-' and '/' variants to force backtracking.
re_ymd = re.compile(
    '((19|20)[0-9]{2}年(1[012]|0?[1-9])月([12][0-9]|3[01]|0?[1-9])日)|'
    '((19|20)[0-9]{2}-(1[012]|0?[1-9])-([12][0-9]|3[01]|0?[1-9]))|'
    '((19|20)[0-9]{2}/(1[012]|0?[1-9])/([12][0-9]|3[01]|0?[1-9]))')

网址、邮箱、电话…

# URL: a scheme ('http://') or a leading label ('www.'), a dotted host, then
# any run of common path/query characters.
re_url = re.compile(r'[a-z]+(://|\.)[\w-]+(\.[a-z0-9_-]+)+[0-9a-z!%&()*./:=?_~,@^+#-]*', re.I)
# E-mail: the local part may contain '.', '_', '+' and '-' after its first
# character, so 'first.last@example.com' is matched whole.
re_email = re.compile('[a-z0-9][a-z0-9._+-]*@[a-z0-9_-]+([.][0-9a-z!%&()*./:=?_~,@^+#-]+)+', re.I)
# Four dot-separated digit groups; the values are not range-checked.
re_ip = re.compile(r'\d+\.\d+\.\d+\.\d+')
# Phone numbers, postal codes, serial numbers and the like.
# NOTE(review): the '86[+]' prefix matches '86+'; it may have been meant as '+86' - confirm.
re_phone = re.compile('([0-9]{3,4}-|86[+])[0-9]{8,}')

数量词

# Number followed by one or more measure words / units.
# Multi-character units are tried before the single-character class; with the
# class first, '3分钟' stopped at '3分' and '5块钱' at '5块'.
re_mq = re.compile(
    r'\d[\d.+%/~-]*('
    '小时|分钟|平方|立方|摄氏度|度电|英[寸尺里]|海里|次数|回合'
    '|[美日欧港]元|美金|块钱|人民币|[港台硬金银]币|[块角分]钱?'
    '|[a-zA-Z/两个十百千万亿几多'
    '天年月日时分秒毫厘微纳米公里尺寸'
    '亩码克斤吨升度元块行列排名号位只架辆部台轮转匹'
    '种类对包瓶箱盒支根条张片队场届双串头层阵级座'
    '岁顿件班次集页管间步镑杯粒点枚幅瓦棵卷单款℃￥°π]'
    ')+')

数学

小数、百分比

# Signed integers, decimals, fractions and simple ranges, with an optional
# '%' or 'π' suffix. The two-part alternative must come first: with the bare
# integer first, '1.23' matched only '1'.
re_decimal = re.compile('[+-]?([0-9]+[./~-][0-9]+|[0-9]+)[%π]?')

1.23 -11.24 +12.5 13.5% -9.5% +1% 1/2 -1/2 +1/2 9.1-13 11~22 11-13% 11~25% 1.2-1.3% 9.1~11%

中文数字+阿拉伯数字

# Arabic digits combined with Chinese magnitude words, e.g. '3千2百多万'.
# Raw string so that '\d' is not an invalid escape; the pattern is bound to
# a name so it can be reused.
re_cn_arabic = re.compile(r'(\d+[.]\d+|\d+)[多百千万]+([0-9][多百千万]+)?')

2千多万 3千2百多万 4万3千多 2百万多 23万多 13.4万 100万 100千万 4百万

数词

# Numerals with optional approximation words, e.g. '大约3万左右'.
re_numeral = re.compile('(大[概约致]|接近|近似)?[0-9][0-9.十百千万亿π]*(左右)?')

# Currency amounts (and discounts), e.g. '9.9美元', '100rmb', '8折'.
re_money = re.compile(
    '[0-9][0-9.十百千万亿k]*'
    '([美日欧港]?元|人民币|rmb|[块角分]钱?|￥|[港台硬金银]币|折扣?)', re.I)

# Comma-grouped numbers: 1-3 leading digits, then groups of exactly three.
# The lookarounds reject digit runs with wrong grouping such as '12345'
# or '1,0000'.
_re_comma_number = re.compile(r'(?<!\d)(?<!\d,)\d{1,3}(?:,\d{3})*(?!\d|,\d)')


def find_comma_numbers(text):
    """Return every well-grouped number in *text*, e.g. 100 1,000 100,000 1,000,000."""
    return _re_comma_number.findall(text)

物理

身高

# Height / weight mentions: an optional '身高'/'高'/'体重'/'重' prefix, a number
# (either 'N米N' or at least two digits with an optional decimal point), an
# optional length unit and an optional trailing '高'.
re_height = re.compile('(身?高|体?重)?([0-9]+米[0-9]*|[0-9]+[.]?[0-9]+)(厘?米|c?m|公分)?高?')

1米7 1.7米 1.7m 170cm 169厘米 168公分 身高170 体重52.2 高180 60重

长度

面积

# Area: a number (Chinese magnitude words allowed) followed by an area unit.
re_area = re.compile(
    '[0-9][0-9.十百千万亿]*'
    '(平方([千分厘毫]?米|英[里尺寸]|公里)|英?亩|公顷)')

体积

# Volume: a number followed by a cubic-metre, litre or 'l'/'ml' unit
# (case-insensitive).
re_volume = re.compile(
    '[0-9][0-9.十百千万亿]*'
    '(立方[分厘毫]?米|毫?升|m?l)', re.I)

重量

# Weight: a number followed by a mass unit (case-insensitive).
# '克拉' is listed before '[千毫]?克'; otherwise '5克拉' stopped at '5克'.
re_weight = re.compile(
    '[0-9][0-9.十百千万亿]*'
    '(克拉|[千毫]?克|公?斤|[km]?g|[吨两磅]|盎司)', re.I)

光、电、热

# Light / electricity / heat quantities.
# The '|' between '安培' and '(摄氏)?度' was missing, so adjacent string
# literals fused into '安培(摄氏)?度' and '30摄氏度' could never match.
re_light_electric_heat = re.compile(
    '[0-9][0-9.十百千万亿k]*'
    '(千瓦时?|度电?|焦耳?|瓦特?|伏特?|kW|库仑|欧姆|安培|'
    '(摄氏)?度|℃|开尔文|摩尔|赫兹|Hz)')

力学

# Mechanics quantities, with an optional '时速' / '油耗' prefix.
# The newton-metre (torque = force unit x distance unit) is listed first;
# after '牛[顿米]?' it was unreachable and '300牛·米' stopped at '300牛'.
re_mechanics = re.compile(
    '(时速|油耗)?[0-9][0-9.十百千万亿k]*'
    '(牛[·*]米|牛[顿米]?|[nN]|帕斯卡|米每秒|摩尔?)')

网址：文本清洗正则表达式（持续更新） https://www.yuejiaxmz.com/news/view/639269

⬅️上一篇：兴仁市城南街道行政村生活垃圾清扫

➡️下一篇：河源江东新区投资建设服务中心古竹

文本清洗正则表达式（持续更新）

常用但记不住的pattern

正向肯定预查

\w

特殊字符清洗

清除连续空白符

HTML标签清洗

标点格化

文本切分

是否单词

时间&日期

网址、邮箱、电话…

数量词

数学

物理

相关内容

随便看看

最新动态分享

热点动态分享

专题

推荐动态分享