安装requests和bs4
安装requests
pip install requests
安装bs4（PyPI 上的正式包名为 beautifulsoup4，bs4 只是占位包）
pip install beautifulsoup4
爬取豆瓣电影数据
# Fetch the page
import requests as req

# 1. Send the request to the server
url = "https://movie.douban.com/top250"
# Browser-like User-Agent header sent along with the request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
}
# Bound the wait so a stalled connection cannot hang the script forever.
response = req.get(url, headers=header, timeout=10)
# Fail loudly on an HTTP error instead of saving an error page as top250.html.
response.raise_for_status()
html = response.content.decode("utf-8")

# 2. Save the page to a local file
with open("top250.html", "w", encoding="utf-8") as f:
    f.write(html)
print(html)
数据处理
from bs4 import BeautifulSoup

# Parse the page saved as top250.html; the with-block closes the file
# handle as soon as the document has been parsed.
with open("top250.html", encoding="utf-8") as page:
    soup = BeautifulSoup(page, "html.parser")
print(soup)
# Query <a> tags with the class filter set to an empty string.
print(soup.find_all("a", class_=""))
获取所有标题
# Load a page that was saved to a local file after crawling; the
# with-block closes the file handle once parsing is finished.
with open("hkiii.html", encoding="utf-8") as page:
    soup = BeautifulSoup(page, "html.parser")
# Select every element carrying the "loglist_title" CSS class.
text = soup.select(".loglist_title")
# Print the text inside each matched tag
for t in text:
    print(t.text)
爬取结果