Python爬虫
Python爬虫
1.任务介绍
爬取豆瓣电影Top250的基本信息
2.爬虫初识
爬虫的本质就是模拟浏览器打开网页,获取网页中我们想要的那部分数据。
3.基本流程
3.1 准备工作
导入包
1 2 3 4 5 6
| import bs4 import re import urllib.request import urllib.error import xlwt import sqlite3
|
3.2 获取数据
1 2 3 4 5 6 7 8 9 10 11 12
import urllib.request
import urllib.parse

# GET request: fetch the Baidu homepage and print the decoded HTML.
# Use the response as a context manager so the underlying socket is
# closed deterministically (the original never closed it).
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode('utf-8'))

# POST request: urlopen sends a POST when `data` is supplied; the
# payload must be URL-encoded bytes, not str.
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
with urllib.request.urlopen("http://httpbin.org/post", data=data) as response:
    print(response.read().decode('utf-8'))
|
超时处理
1 2 3 4 5 6
| try: response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.01) print(response.read().decode('utf-8')) except urllib.error.URLError as e: print("Time out")
|
获取豆瓣数据
1 2 3 4 5 6 7
| url = "https://www.douban.com" headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36" } req = urllib.request.Request(url=url, headers=headers) response = urllib.request.urlopen(req) print(response.read().decode("utf-8"))
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14
from bs4 import BeautifulSoup

# Parse a locally saved copy of the Baidu homepage.
# `with` guarantees the handle is closed (the original opened the file
# and never closed it); read bytes so BeautifulSoup sniffs the encoding.
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")

print(bs.title)         # the whole <title> tag
print(bs.title.string)  # just the tag's text content

'''
结果:
<title>百度一下,你就知道</title>
百度一下,你就知道
'''
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
# --- Traversal: .contents lists a tag's direct children ---
print(bs.head.contents)
print("-"*30)
print(bs.head.contents[1])

# --- find_all with a tag name: every <a> tag ---
t_list = bs.find_all("a")
print(t_list)

# --- find_all with a regex: tags whose NAME matches the pattern ---
t_list = bs.find_all(re.compile("a"))
print(t_list)

# --- find_all with keyword filters ---
t_list = bs.find_all(id="head")    # tags with id="head"
t_list = bs.find_all(class_=True)  # tags carrying any class attribute
for item in t_list:
    print(item)

# --- find_all with text filters ---
t_list = bs.find_all(text="hao123")
# raw string r"\d": \d is a regex digit class, and in a non-raw
# string "\d" is an invalid escape (SyntaxWarning, future error)
t_list = bs.find_all(text=re.compile(r"\d"))
for item in t_list:
    print(item)

# --- limit caps the number of results ---
t_list = bs.find_all("a", limit=3)
for item in t_list:
    print(item)

# --- CSS selectors via select() ---
t_list = bs.select('title')             # by tag name
t_list = bs.select('.mnav')             # by class
t_list = bs.select('#u1')               # by id
t_list = bs.select("a[class='s_tab']")  # by attribute
t_list = bs.select("head > title")      # by hierarchy
for item in t_list:
    print(item)
|
3.3 解析数据
正则表达式
注:建议在正则表达式中,被比较的字符串前面加上r,不用担心转义字符的问题
search方法进行比较查找
1 2 3 4 5 6 7 8 9 10 11 12 13 14
import re

# Pre-compile the pattern "AA" and probe several haystacks.
# search() returns None when nothing matches, otherwise a Match
# object for the leftmost occurrence only.
pat = re.compile("AA")
for haystack in ("CBA", "ABCAA", "ABCAADDHHAAA"):
    print(pat.search(haystack))

'''
结果:
None
<re.Match object; span=(3, 5), match='AA'>
<re.Match object; span=(3, 5), match='AA'>
'''
|
1 2 3 4 5 6 7 8 9 10 11 12
import re

# Module-level functions: no need to pre-compile a one-shot pattern.
print(re.search("asd", "Aasd"))                # leftmost match of "asd"
print(re.findall("a", "Asdhasiohoiask"))       # every "a", as a list
print(re.findall("[A-Z]", "AsdhasEFohSDask"))  # every uppercase letter

'''
结果:
<re.Match object; span=(1, 4), match='asd'>
['a', 'a']
['A', 'E', 'F', 'S', 'D']
'''
|
替换
1 2 3 4 5 6 7 8
import re

# re.sub(pattern, replacement, target): replace EVERY "a" with "A".
replaced = re.sub("a", "A", "adhsjiaaadw")
print(replaced)

'''
结果:
AdhsjiAAAdw
'''
|
例:打印99乘法表,并写入excel表中
1 2 3 4 5 6 7 8
import xlwt

# Build the 9x9 multiplication table and save it as an .xls workbook.
workbook = xlwt.Workbook(encoding="utf-8")
worksheet = workbook.add_sheet('sheet1')
# Row `row` holds products of (row+1) with 1..(row+1):
# a lower-triangular table, one product per cell.
for row in range(9):
    for col in range(row + 1):
        cell = "%d * %d = %d" % (row + 1, col + 1, (row + 1) * (col + 1))
        worksheet.write(row, col, cell)
workbook.save('student.xls')
|
保存数据到Excel中
1 2 3 4 5 6 7 8 9 10 11 12 13
def saveData(dataList, savePath):
    """Write the scraped Douban Top250 records to an .xls workbook.

    Args:
        dataList: list of records, one per movie; each record is an
            8-item sequence matching the column order of ``col`` below.
        savePath: path of the .xls file to create.
    """
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('豆瓣电影TOP250', cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外文名", "评分", "评价人数", "概况", "相关信息")
    # Header row.
    for j, title in enumerate(col):
        worksheet.write(0, j, title)
    # Data rows: iterate over what was actually scraped instead of a
    # hard-coded 250, so a partial scrape no longer raises IndexError;
    # column count is derived from the header tuple for the same reason.
    for i, data in enumerate(dataList):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            worksheet.write(i + 1, j, data[j])
    workbook.save(savePath)
|