# JustZiya

# Scrape the list of watched movies from Douban with BeautifulSoup
# (使用 BeautifulSoup 抓取豆瓣看过的电影列表)

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import csv


def _parse_item(item):
    """Build one CSV row from a single ``<li class="item">`` element.

    Returns a list of eight columns -- link, first title, remaining
    title(s), text of the date block, rating character, tags, comment and
    a trailing ``None`` -- or ``None`` when the element has no ``<a>`` tag
    to take the link and title from.
    """
    link = item.a
    if link is None:
        return None

    # Douban link
    row = [link['href']]

    # The anchor text is split on the first '/': the part before it goes in
    # one column and everything after it in the next (Chinese name and
    # original name, per the original comment). A title without a slash is
    # repeated so every row keeps the same number of columns.
    first, slash, rest = link.get_text().partition('/')
    first = first.strip()
    row.append(first)
    row.append(rest.strip() if slash else first)

    # Date on which the entry was marked
    date_div = item.find('div', 'date')
    row.append(date_div.get_text().strip() if date_div is not None else '')

    # Rating: third-from-last character of the first CSS class of the span
    # inside the date block (presumably a class shaped like "rating4-t" --
    # TODO confirm against the live markup). Falls back to '0' when the
    # span, its class attribute or that character is missing.
    try:
        row.append(date_div.span['class'][0][-3])
    except (AttributeError, TypeError, KeyError, IndexError):
        row.append('0')

    # Tags: the first four characters are dropped (presumably a label
    # prefix in front of the tag list -- TODO confirm).
    tags_span = item.find('span', 'tags')
    if tags_span is not None:
        row.append(tags_span.get_text().strip()[4:])
    else:
        row.append(None)

    # Comment: text fragments are joined with a backslash, then every run
    # of whitespace is collapsed to a single space.
    comment_div = item.find('div', 'comment')
    if comment_div is not None:
        row.append(' '.join('\\'.join(comment_div.stripped_strings).split()))
    else:
        row.append(None)

    # Trailing empty column, kept so the row layout matches earlier output.
    row.append(None)
    return row


def get_m(url):
    """Fetch one list page and append its entries to ``movie.csv``.

    The page at *url* is downloaded and parsed with the ``lxml`` parser.
    Every ``<li class="item">`` element becomes one ``|``-delimited row
    appended to ``movie.csv`` in the current working directory. Nothing is
    written, and the file is not created, when the page has no such
    elements. Elements without an ``<a>`` tag are skipped.

    Args:
        url: Address of the list page to fetch.

    Raises:
        requests.RequestException: If the request fails or takes longer
            than 30 seconds.
    """
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    items = soup.find_all('li', 'item')
    if not items:
        return

    # Open the file once per page. newline='' is what the csv module
    # requires, and an explicit encoding keeps non-ASCII titles from
    # depending on the platform's locale default.
    with open('movie.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='|')
        for item in items:
            row = _parse_item(item)
            if row is not None:
                writer.writerow(row)


if __name__ == '__main__':
    # Account whose list is scraped.
    user_id = 'annho'
    url_template = 'https://movie.douban.com/people/%s/collect?sort=time&start=%s&filter=all&mode=list&tags_sort=count'

    # Walk the list by advancing the `start` offset in steps of 30 while it
    # stays below 3635 (presumably the size of this user's list -- confirm).
    offset = 0
    while offset < 3635:
        print(offset)
        get_m(url_template % (user_id, offset))
        offset += 30

# 评论