【自然语言处理数据清洗】清洗文本中html标签

本文主要介绍【自然语言处理数据清洗】如何清洗文本中的 HTML 标签，希望能为大家解决编程问题提供一定的参考，需要的开发者们跟着小编一起来学习吧！

一段文本中既有文字，又夹杂着很多 HTML 标签，十分杂乱，需要进行清洗。下面是用 Python 过滤这些垃圾 HTML 标签的脚本。

# -*- coding:utf-8 -*-
import re

import jieba
import pandas as pd
# Entities this cleaner knows how to decode, keyed by the text between
# '&' (plus an optional '#') and ';'. Both the named and the decimal form
# of each entity are listed. Anything not listed is dropped from the text.
_CHAR_ENTITIES = {
    'nbsp': ' ', '160': ' ',
    'lt': '<', '60': '<',
    'gt': '>', '62': '>',
    'amp': '&', '38': '&',
    'quot': '"', '34': '"',
}

# Patterns are compiled once at import time instead of on every call.
_RE_CHAR_ENTITY = re.compile(r'&#?(?P<name>\w+);')
# Script-style CDATA wrapper:  //<![CDATA[ ... //]]>
_RE_CDATA = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
# Lazy, dot-matches-newline bodies so that a '<' inside JS/CSS does not stop
# the match and leave the code behind in the output.
_RE_SCRIPT = re.compile(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S)
_RE_STYLE = re.compile(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S)
# Comments may legally contain '>', so match lazily up to the closing '-->'.
_RE_COMMENT = re.compile(r'<!--.*?-->', re.S)
_RE_BR = re.compile(r'<br\s*/?>', re.I)
_RE_TAG = re.compile(r'</?\w+[^>]*>')
_RE_BLANK_LINES = re.compile(r'\n+')

# Literal fragments removed by Cleaning_data before the regex pass.
# Order matters: longer fragments must be removed before their sub-fragments.
_LITERAL_NOISE = (
    '<p>&nbsp; &nbsp; &nbsp; &nbsp;',
    '</p><p><br></p>',
    '<br>',
    '</p>',
    '<p>',
    '       ',
    '[图片]',
)


def filter_tags(htmlstr):
    """Strip HTML markup from a string using regular expressions.

    Removal order: CDATA wrappers, <script> blocks, <style> blocks,
    comments, <br> (converted to a newline), then every remaining tag.
    Runs of newlines are collapsed to a single newline and character
    entities are decoded last via replaceCharEntity().

    :param htmlstr: text that may contain HTML markup
    :return: the text with markup removed and known entities decoded
    """
    s = _RE_CDATA.sub('', htmlstr)
    s = _RE_SCRIPT.sub('', s)
    s = _RE_STYLE.sub('', s)
    # Comments go before generic tags so that a '>' inside a comment cannot
    # leave half of the comment behind.
    s = _RE_COMMENT.sub('', s)
    s = _RE_BR.sub('\n', s)
    s = _RE_TAG.sub('', s)
    s = _RE_BLANK_LINES.sub('\n', s)
    return replaceCharEntity(s)


def replaceCharEntity(htmlstr):
    """Decode the character entities listed in _CHAR_ENTITIES.

    Every '&name;' or '&#name;' sequence is replaced in a single pass:
    known entities become their character, unknown ones become the empty
    string. Because the text is scanned only once, the output of one
    replacement is never decoded again ('&amp;lt;' yields '&lt;').

    :param htmlstr: HTML string
    :return: the string with entities replaced
    """
    return _RE_CHAR_ENTITY.sub(
        lambda match: _CHAR_ENTITIES.get(match.group('name'), ''),
        htmlstr,
    )


def repalce(s, re_exp, repl_string):
    """Return s with every match of the compiled pattern re_exp replaced.

    The misspelled name is kept so existing callers keep working.

    :param s: input string
    :param re_exp: compiled regular expression (anything with a .sub method)
    :param repl_string: replacement passed to re_exp.sub
    :return: the substituted string
    """
    return re_exp.sub(repl_string, s)


def Cleaning_data(x):
    """Clean one raw value, print the result and return it.

    The value is converted with str(), a fixed list of literal fragments is
    removed, surrounding whitespace is stripped, and the remainder goes
    through filter_tags(), which also decodes entities. Entities are decoded
    exactly once; a second pass would turn an escaped '&amp;lt;' into '<'.

    :param x: value to clean; converted with str() first
    :return: the cleaned text (also printed to stdout)
    """
    text = str(x)
    for fragment in _LITERAL_NOISE:
        text = text.replace(fragment, '')
    cleaned = filter_tags(text.strip())
    print(cleaned)
    return cleaned


if __name__ == '__main__':
    # Read the data.
    data = pd.read_csv('C:\\Users\\xiaohu\\Desktop\\香蕉球用户话题\\香蕉球用户话题.csv')
    # Clean every value in the fourth column (positional index 3).
    for each in data.iloc[:, 3]:
        Cleaning_data(each)