实例代码
# 拿到页面的源代码 requests
# 通过re来提取想要的有效信息 re
import requests
import re
import csv
# Douban Top 250 landing page; only this single page is fetched.
url = "https://movie.douban.com/top250"

# Browser-like User-Agent sent with the request.
# NOTE(review): presumably needed because the site rejects the default
# python-requests client string -- confirm against the live site.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5226.400"
}

# One match per <li> movie entry.  re.S lets "." cross newlines because a
# single entry spans many lines of markup.
#
# The year group skips the whitespace that follows <br> and stops at either
# an "&nbsp;" entity or a whitespace character.  The previous pattern used a
# bare space as the terminator, so with any whitespace between <br> and the
# year the lazy group captured only that whitespace and the stripped year
# came out empty.
# NOTE(review): the live page is understood to separate the fields with
# "&nbsp;" entities (left undecoded in resp.text) -- confirm.
obj = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
    r'</span>.*?<p class="">.*?<br>\s*(?P<year>.*?)(?:&nbsp;|\s).*?<span '
    r'class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
    r'<span>(?P<num>.*?)人评价</span>',
    re.S,
)


def parse_movies(page_content):
    """Extract one ``[name, year, score, num]`` row per movie entry.

    Args:
        page_content: HTML text of a listing page.

    Returns:
        A list of rows, in page order, each a list of four strings in the
        order of the named groups in ``obj``.  Empty when nothing matches.
    """
    rows = []
    for it in obj.finditer(page_content):
        dic = it.groupdict()
        dic["year"] = dic["year"].strip()
        rows.append(list(dic.values()))
    return rows


def main():
    """Download the page, parse it and write the rows to ``data.csv``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within 10 seconds.
    """
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status stops an error page from being parsed
    # into a silently empty CSV.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    rows = parse_movies(resp.text)

    # newline="" is what the csv module requires so it controls line endings
    # itself; the with-block closes the file even if writing fails.
    with open("data.csv", mode="w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)
    print("over!")


if __name__ == "__main__":
    main()
运行结果
肖申克的救赎,1994,9.7,2796176
霸王别姬,1993,9.6,2069264
阿甘正传,1994,9.5,2094883
泰坦尼克号,1997,9.5,2056649
这个杀手不太冷,1994,9.4,2230003
美丽人生,1997,9.6,1287121
千与千寻,2001,9.4,2170804
辛德勒的名单,1993,9.6,1072512
盗梦空间,2010,9.4,2002284
星际穿越,2014,9.4,1759421
楚门的世界,1998,9.4,1635225
忠犬八公的故事,2009,9.4,1360270
海上钢琴师,1998,9.3,1630329
三傻大闹宝莱坞,2009,9.2,1808437
放牛班的春天,2004,9.3,1274515
机器人总动员,2008,9.3,1279043
无间道,2002,9.3,1314501
疯狂动物城,2016,9.2,1861918
控方证人,1957,9.6,525103
大话西游之大圣娶亲,1995,9.2,1487733
熔炉,2011,9.4,905722
教父,1972,9.3,933062
当幸福来敲门,2006,9.2,1476119
触不可及,2011,9.3,1070550
怦然心动,2010,9.1,1778194
© 版权声明
文章版权归作者所有,未经允许请勿转载。
THE END
暂无评论内容