本文主要介绍如何使用 Python 3.0 以上版本爬取豆瓣电影,希望能为大家解决编程问题提供一定的参考价值,需要的开发者们跟随小编一起来学习吧!
import re
import urllib.error
import urllib.request

"""
1.获取排名前100的电影
2.每页电影的数量是25个
3.获得的html页面中,跟电影名相关的是这一行:<span class="title">肖申克的救赎</span>
  下面这一行与上一行的结构类似,所以正则匹配的时候需要将此行去掉:<span class="title">&nbsp;/&nbsp;The Shawshank Redemption</span>
"""
class SpiderDouBan(object):
    """Collect movie titles from the Douban Top 250 list.

    Each list page holds 25 movies; the page offset is passed through the
    ``start`` query parameter of ``base_url``. With the default settings,
    ``start_spider`` walks pages 0..3, i.e. the first 100 entries.

    Instance attributes (all set in ``__init__``):
        page: zero-based index of the next list page to fetch.
        base_url: URL template containing a ``{page}`` placeholder that
            receives the item offset (``page * 25``).
        result: accumulated output lines of the form ``"top<N> <title>"``.
        _index: rank number that will be given to the next accepted title.
    """

    # Every title on the page lives in a <span class="title"> element.
    _TITLE_PATTERN = re.compile(r'<span.*?class="title">(.*?)</span>', re.S)

    # A movie's alternate (foreign-language) title is rendered in a second
    # <span class="title"> that starts with a non-breaking-space separator.
    # Depending on how the page was decoded this shows up as the raw entity,
    # as U+00A0, or as a plain space, so all three are treated as markers.
    _ALT_TITLE_MARKERS = ("&nbsp", "\xa0", " ")

    def __init__(self):
        """Start at the first page with an empty result list."""
        self.page = 0
        self.base_url = "http://movie.douban.com/top250?start={page}&filter=&type="
        self.result = []
        self._index = 1

    def get_page_html(self):
        """Download the current list page.

        Returns:
            The page decoded as UTF-8, or ``None`` when the request failed
            with a ``urllib.error.URLError`` (the error is printed).
        """
        url = self.base_url.format(page=self.page * 25)
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(url) as response:
                return response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print("The server couldn't fulfill the request.")
                print("Error code: %s" % e.code)
            elif hasattr(e, "reason"):
                print("We failed to reach a server. Please check your url and read the Reason")
                print("Reason: %s" % e.reason)
        # Reached only after a handled error: there is no page to return.
        return None

    def get_index_and_name(self, my_page):
        """Extract primary movie titles from *my_page* into ``self.result``.

        Alternate-title spans (those containing a separator marker) are
        skipped and do not consume a rank number.

        Args:
            my_page: HTML text of one list page; ``None`` or an empty string
                is accepted and leaves the state untouched.
        """
        if not my_page:
            return
        for title in self._TITLE_PATTERN.findall(my_page):
            if any(marker in title for marker in self._ALT_TITLE_MARKERS):
                continue
            self.result.append("top" + str(self._index) + " " + title)
            self._index += 1

    def start_spider(self, last_page=3):
        """Fetch and parse list pages from ``self.page`` up to *last_page*.

        Args:
            last_page: zero-based index of the final page to fetch
                (inclusive). The default of 3 covers the first 100 movies.

        The crawl stops at the first page that cannot be downloaded so that
        the rank numbers of the collected titles stay contiguous.
        """
        while self.page <= last_page:
            my_page = self.get_page_html()
            if my_page is None:
                break
            self.get_index_and_name(my_page)
            self.page += 1


def main():
    """Run the spider and print every collected ``top<N> <title>`` line."""
    spider = SpiderDouBan()
    spider.start_spider()
    for item in spider.result:
        print(item)


if __name__ == '__main__':
    main()
# 这段代码出自 https://blog.csdn.net/bitcarmanlee/article/details/67661958 的博主,但原文是 Python 2.7 的版本,这里针对 Python 3 改动了几处小地方,在此借用并致谢。
这篇关于使用 Python 3.0 以上版本爬取豆瓣电影的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!