1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
"""Practice script: scrape the Douban Movie Top 250 ranking."""
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Request headers sent with every page fetch. The User-Agent imitates a
# desktop Chrome browser (presumably because the site rejects the default
# python-requests agent -- confirm).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}

PAGE_SIZE = 25          # step of the ``start`` query parameter between pages
TOTAL_MOVIES = 250      # the scrape covers start=0, 25, ..., 225 (ten pages)
REQUEST_TIMEOUT = 10    # seconds; without a timeout a stalled socket hangs forever
OUTPUT_PATH = "douban_top250.csv"

# Column order of the exported CSV; also guarantees a header row when no
# movie could be scraped at all.
CSV_COLUMNS = ["排名", "电影名", "评分", "评价人数", "评论"]

# Digits that directly precede the "人评价" ("people rated") suffix.
_VOTE_COUNT_RE = re.compile(r"(\d+)\s*人评价")


def extract_vote_count(star_text: str) -> str:
    """Return the number of raters found in the text of a ``div.star`` block.

    The count is located by pattern rather than by fixed slice offsets, so
    it does not depend on how much whitespace surrounds the rating.
    Assumes the count is written as digits immediately followed by
    "人评价" -- confirm against the live page markup.

    Args:
        star_text: Text content of the ``div.star`` element (may be empty).

    Returns:
        The digit string of the vote count, or ``""`` when none is found.
    """
    match = _VOTE_COUNT_RE.search(star_text or "")
    return match.group(1) if match else ""


def _tag_text(tag) -> str:
    """Return ``tag.text``, or ``""`` when the lookup returned ``None``."""
    return tag.text if tag is not None else ""


def _parse_item(item) -> dict:
    """Build one CSV row (as a dict) from a single ``div.item`` element."""
    star_text = _tag_text(item.find("div", class_="star"))
    return {
        "排名": _tag_text(item.find("em")),
        "电影名": _tag_text(item.find("span", class_="title")),
        "评分": _tag_text(item.find("span", class_="rating_num")),
        "评价人数": extract_vote_count(star_text),
        # The one-line quote (span.inq) is optional; missing -> empty string.
        "评论": _tag_text(item.find("span", class_="inq")),
    }


def parse_movies(html: str) -> list:
    """Parse one listing page and return a row dict per ``div.item``."""
    soup = BeautifulSoup(html, "html.parser")
    return [_parse_item(item) for item in soup.find_all("div", class_="item")]


def fetch_page(start: int):
    """Download the listing page that begins at offset ``start``.

    Returns:
        The response body as text, or ``None`` when the request failed or
        the server answered with a status other than 200. Failures are
        reported on stdout instead of being skipped silently.
    """
    url = f"https://movie.douban.com/top250?start={start}"
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Skipping {url}: request failed ({exc})")
        return None
    if response.status_code != 200:
        print(f"Skipping {url}: HTTP {response.status_code}")
        return None
    return response.text


def scrape_top250() -> list:
    """Fetch every listing page and return all parsed rows in page order.

    Pages that cannot be fetched are skipped (best effort), so the result
    may hold fewer than ``TOTAL_MOVIES`` rows.
    """
    collected = []
    for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
        html = fetch_page(start)
        if html is not None:
            collected.extend(parse_movies(html))
    return collected


if __name__ == "__main__":
    movies = scrape_top250()
    df = pd.DataFrame(movies, columns=CSV_COLUMNS)
    # utf-8-sig prepends a BOM (presumably so spreadsheet tools detect the
    # encoding of the Chinese column names -- confirm).
    df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")
|