Python移除Html的sytyle属性(remove attributes from HTML tags

清洗爬虫数据的时候可能会需要去掉很多 word 软件生成的 html 属性。

以下代码在 python3.6 环境下测试通过。

import lxml.html.clean as clean
# 需要保留的 html 属性
safe_attrs = set(['src', 'href', 'colspan', 'rowspan'])
cleaner = clean.Cleaner(safe_attrs=safe_attrs)
html_string = "var desc='<p><span style=\"color: #000000;\"></span></p><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;line-height: 30.0pt;\"><span style=\"color: #000000;\"><br></span></p><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 22.0pt;\"> 拍卖财产信息表 </span></p><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;line-height: 30.0pt;\"><span style=\"color: #000000;\"><br></span></p><p><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\">  </span><span style=\"color: #000000;\"> </span><span style=\"color: #000000;\"></span></p><table width=\"586\" style=\"margin: auto auto auto -8.8pt;border: currentcolor;width: 439.45pt;border-collapse: collapse;\" border=\"1\" cellspacing=\"0\" cellpadding=\"0\"><tr><td width=\"95\" style=\"padding: 0.0cm 5.4pt;border: 1.0pt solid black;width: 70.9pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 拍卖财产 </span></p><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 名称 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"491\" style=\"border-width: 1.0pt 1.0pt 1.0pt 0.0px;border-style: solid solid solid none;border-color: black black black #000000;padding: 0.0cm 5.4pt;width: 13.0cm;background-color: transparent;\" colspan=\"3\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #7f7f7f;font-size: 15.0pt;\"> 云南省安宁市太平镇始甸村委会新邑村民小组 </span></p><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #7f7f7f;font-size: 15.0pt;\"> 国有出让城镇单一住宅土地 </span></p><span style=\"color: #000000;\">  </span></td></tr><tr><td width=\"95\" style=\"border-width: 0.0px 1.0pt 1.0pt;border-style: none solid solid;border-color: #000000 black black;padding: 0.0cm 5.4pt;width: 70.9pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 权证 </span></p><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 情况 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"491\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 13.0cm;background-color: transparent;\" colspan=\"3\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #7f7f7f;font-size: 15.0pt;\"> 土地证号:安国用(<span>2008</span>)第 <span>0529</span> 号 </span></p><span style=\"color: #000000;\">  </span></td></tr><tr><td width=\"95\" style=\"border-width: 0.0px 1.0pt 1.0pt;border-style: none solid solid;border-color: #000000 black black;padding: 0.0cm 5.4pt;width: 70.9pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 权利限 </span></p><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 制情况 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"491\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 13.0cm;background-color: transparent;\" colspan=\"3\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: gray;font-size: 15.0pt;\"> 已查封,抵押于峨山县农村信用合作联社 </span></p><span style=\"color: #000000;\">  </span></td></tr><tr><td width=\"95\" style=\"border-width: 0.0px 1.0pt 1.0pt;border-style: none solid solid;border-color: #000000 black black;padding: 0.0cm 5.4pt;width: 70.9pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 评估价 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"180\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 134.7pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;\"><span style=\"font-size: 15.0pt;\">11791261</span><span style=\"font-size: 15.0pt;\"> 元 </span></span></p><span style=\"color: #000000;\">  </span></td><td width=\"161\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 120.45pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"font-size: 15.0pt;\"><span style=\"color: #000000;\"> 起拍价 <span>(</span></span><span style=\"color: #000000;\"> 保留价 </span><span style=\"undefinedcolor: #000000;\">)</span></span></p><span style=\"color: #000000;\">  </span></td><td width=\"151\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 4.0cm;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;\"><span style=\"font-size: 15.0pt;\">11791261</span><span style=\"font-size: 15.0pt;\"> 元 </span></span></p><span style=\"color: #000000;\">  </span></td></tr><tr><td width=\"95\" style=\"border-width: 0.0px 1.0pt 1.0pt;border-style: none solid solid;border-color: #000000 black black;padding: 0.0cm 5.4pt;width: 70.9pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 保证金 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"180\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 134.7pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #7f7f7f;font-size: 12.0pt;\">59</span><span style=\"color: #7f7f7f;font-size: 12.0pt;\"> 万元(起拍价的 <span>5</span></span><span style=\"color: #7f7f7f;font-size: 12.0pt;\">﹪</span><span style=\"color: #7f7f7f;font-size: 12.0pt;\">)</span></p><span style=\"color: #000000;\">  </span></td><td width=\"161\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 120.45pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 增价幅度 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"151\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 4.0cm;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #7f7f7f;font-size: 12.0pt;\">5.8</span><span style=\"color: #7f7f7f;font-size: 12.0pt;\"> 万元(起拍价的 <span>0.5</span></span><span style=\"color: #7f7f7f;font-size: 12.0pt;\">﹪</span><span style=\"color: #7f7f7f;font-size: 12.0pt;\">)</span></p><span style=\"color: #000000;\">  </span></td></tr><tr><td width=\"95\" style=\"border-width: 0.0px 1.0pt 1.0pt;border-style: none solid solid;border-color: #000000 black black;padding: 0.0cm 5.4pt;width: 70.9pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 看样 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"491\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 13.0cm;background-color: transparent;\" colspan=\"3\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #7f7f7f;font-size: 15.0pt;\"> 联系人:杨律师,联系电话:<span>13987790662</span></span></p><span style=\"color: #000000;\">  </span></td></tr><tr style=\"height: 208.8pt;\"><td width=\"95\" style=\"border-width: 0.0px 1.0pt 1.0pt;border-style: none solid solid;border-color: #000000 black black;padding: 0.0cm 5.4pt;width: 70.9pt;height: 208.8pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> </span></p><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 已知瑕疵及权利 </span></p><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 负担 </span></p><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> </span></p><span style=\"color: #000000;\">  </span></td><td width=\"491\" valign=\"top\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 13.0cm;height: 208.8pt;background-color: transparent;\" colspan=\"3\"><span style=\"color: #000000;\">  </span><p style=\"margin: 0.0cm 0.0cm 0.0pt;text-indent: 30.0pt;\"><span style=\"color: #000000;font-size: 15.0pt;\"> </span></p><span style=\"color: #000000;\">  </span></td></tr><tr><td width=\"95\" style=\"border-width: 0.0px 1.0pt 1.0pt;border-style: none solid solid;border-color: #000000 black black;padding: 0.0cm 5.4pt;width: 70.9pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 优先购买权人 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"491\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 13.0cm;background-color: transparent;\" colspan=\"3\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #7f7f7f;font-size: 15.0pt;\"> 无 </span></p><span style=\"color: #000000;\">  </span></td></tr><tr style=\"height: 61.85pt;\"><td width=\"95\" style=\"border-width: 0.0px 1.0pt 1.0pt;border-style: none solid solid;border-color: #000000 black black;padding: 0.0cm 5.4pt;width: 70.9pt;height: 61.85pt;background-color: transparent;\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #000000;font-size: 15.0pt;\"> 备注 </span></p><span style=\"color: #000000;\">  </span></td><td width=\"491\" style=\"border-width: 0.0px 1.0pt 1.0pt 0.0px;border-style: none solid solid none;border-color: #000000 black black #000000;padding: 0.0cm 5.4pt;width: 13.0cm;height: 61.85pt;background-color: transparent;\" colspan=\"3\"><span style=\"color: #000000;\">  </span><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;\"><span style=\"color: #7f7f7f;font-size: 15.0pt;\"> </span></p><span style=\"color: #000000;\">  </span></td></tr></table><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;line-height: 30.0pt;\"><span style=\"color: #000000;\"><br></span></p><p style=\"margin: 0.0cm 0.0cm 0.0pt;\"><span style=\"color: #000000;font-size: 15.0pt;\"> </span></p><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;line-height: 30.0pt;\"><span style=\"color: #000000;\"><br></span></p><p align=\"center\" style=\"margin: 0.0cm 0.0cm 0.0pt;text-align: center;line-height: 30.0pt;\"><span style=\"color: #000000;\"><br></span></p>';\n"
html_string = html_string.replace("';\n", "").replace("var desc='","")
html_string = cleaner.clean_html(html_string)
print(html_string)
正文完
 0
评论(没有评论)