爬虫小练习：爬取内涵段子指定页数段子（可控制是否继续爬取）

本文主要是介绍爬虫小练习：爬取内涵段子指定页数段子（可控制是否继续爬取），希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

import urllib.request
import re# pattern1 = re.compile('<a\shref="(.*?)"\sclass="title"\stitle')  匹配完整段子内容链接
#
# content_url_list = pattern1.findall(html)
#
# pattern2 = re.compile('</p>(.*?)<div\sclass="ad610">',re.S)  匹配点开段子标题后完整段子的内容
#
# content_list = pattern2.findall(html)
#
# http://www.neihan8.com/article/index_3.html
#
# User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36class Spider:def __init__(self,page):self.page = pageself.switch = True  # 爬取开关，决定用户是否继续爬取页面信息def loadPage(self):'''下载页面'''# 下载第一部分页面来获取完整段子内容的连接，且打开链接print("页面下载中......")if self.page == "1":url = "http://www.neihan8.com/article/index.html"else:url = "http://www.neihan8.com/article/index_"+ self.page +".html"headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}request = urllib.request.Request(url,headers=headers)response = urllib.request.urlopen(request)html = response.read().decode("utf-8")# print(html)pattern1 = re.compile('<a\shref="(.*?)"\sclass="title"\stitle')content_url_list = pattern1.findall(html)print("页面下载完成！")for content_url in content_url_list:# print(url)content_url = "http://www.neihan8.com" + content_urlrequest = urllib.request.Request(content_url, headers=headers)response = urllib.request.urlopen(request)html = response.read().decode("utf-8")pattern2 = re.compile('</p>(.*?)<div\sclass="ad610">', re.S)content_list = pattern2.findall(html)self.dealPage(content_list)def dealPage(self,content_list):'''处理每页的段子信息'''for content in content_list:# print(content)# print("-" * 30)content = content.replace('<p>','').replace('</p>','')# print(content)# print("-" * 30)self.writPage(content)def writPage(self,content):'''把段子信息写入文件中'''print("文件写入中......")with open("内涵段子第"+ self.page +"页集合.txt","a") as f:f.write(content)f.write("\n" + ("-"*50))def work(self):'''控制爬虫如何运行'''print("文件写入完成！感谢使用！")while self.switch:command = input("如果确定继续爬取，请按回车（退出按q）：")if command == "q":self.switch = Falseelse:page_num = input("请输入要再次爬取的页码：")self.page = page_numself.loadPage()if __name__ == '__main__':page_num = input("请输入要爬取的页码：")Spider = Spider(page_num)Spider.loadPage()Spider.work()

这篇关于爬虫小练习：爬取内涵段子指定页数段子（可控制是否继续爬取）的文章就介绍到这儿，希望我们推荐的文章对编程师们有所帮助！