JustZiya

抓取豆瓣电影的 IMDB 链接

# -*- coding: utf-8 -*-

import requests
from peewee import *
import random
import datetime
import string
import re
from fake_useragent import UserAgent
from time import sleep

# Shared fake_useragent instance; get_html reads `ua.random` to pick a User-Agent header.
ua = UserAgent()
# SQLite database file that backs every model declared below.
db = SqliteDatabase("movie.sqlite")


class BaseModel(Model):
    """Base peewee model that binds its subclasses to the module-level `db`."""
    class Meta:
        # Models inheriting from BaseModel are stored in `db`.
        database = db


class Movie(BaseModel):
    """A movie row: its Douban page URL plus the IMDB link scraped for it."""
    # Douban page URL; unique per row. The main script fetches this URL.
    Douban_Url = CharField(unique=True)
    # IMDB link found on the Douban page; empty string until one is stored.
    IMDB = CharField(default="")
    # Presumably the Chinese title - TODO confirm against whatever fills the table.
    CN_name = CharField()
    # Presumably the original-language title - TODO confirm.
    OG_name = CharField()
    # Defaults to datetime.datetime.now; the main script overwrites it when it
    # stores an IMDB link.
    Update_time = DateTimeField(default=datetime.datetime.now)
    # 0 by default; the main script sets it to 1 once an IMDB link has been
    # saved and only processes rows where it is still 0.
    is_check = IntegerField(default=0)


def get_proxy():
    """Fetch one proxy address from the proxy-pool service.

    Returns:
        The response body as text with surrounding whitespace removed.
        NOTE(review): assumes the service answers with a plain-text
        ``host:port`` body; some proxy_pool releases answer with JSON
        instead - confirm against the deployed version.
    """
    response = requests.get("http://proxy_pool:5010/get/")
    # `.content` is bytes on Python 3, and formatting bytes into a URL yields
    # "http://b'1.2.3.4:80'"; return decoded text so callers can format it.
    return response.text.strip()


def delete_proxy(proxy):
    """Ask the proxy-pool service to drop `proxy` from its pool.

    Args:
        proxy: The proxy value to remove, interpolated into the query string.
    """
    endpoint = "http://proxy_pool:5010/delete/?proxy=%s" % (proxy,)
    requests.get(endpoint)


def get_html(url, cookie, timeout=10):
    """Download `url` through a pooled proxy and return the response body.

    Up to 5 attempts are made. A request error or an HTTP 403 response each
    consume one attempt; after a 403 a fresh cookie from `get_cookie()` is
    used for the next attempt. When every attempt fails the proxy is removed
    from the pool.

    Args:
        url: Page to fetch.
        cookie: Dict passed to requests as the `cookies` argument.
        timeout: Seconds to wait per request, so a dead proxy cannot block
            forever and prevent the retry loop from running.

    Returns:
        The response text, or None when all attempts failed.
    """
    retry_count = 5
    proxy = get_proxy()
    if isinstance(proxy, bytes):
        # Formatting raw bytes would produce "http://b'host:port'".
        proxy = proxy.decode("utf-8").strip()
    headers = {"User-Agent": ua.random}
    # NOTE(review): only the "http" scheme is mapped, so requests will not
    # route https:// URLs through the proxy - confirm this is intended.
    proxies = {"http": "http://{}".format(proxy)}
    while retry_count > 0:
        try:
            r = requests.get(
                url,
                headers=headers,
                proxies=proxies,
                cookies=cookie,
                timeout=timeout,
            )
        except requests.RequestException:
            retry_count -= 1
            continue
        if r.status_code == 403:
            # Blocked: retry with a fresh cookie instead of handing the 403
            # page back to the caller.
            cookie = get_cookie()
            retry_count -= 1
            continue
        if r.status_code == 404:
            print(url)
        return r.text
    # After 5 failed attempts, remove the proxy from the pool.
    print("访问出错")
    delete_proxy(proxy)
    return None


def get_link(url, cookie):
    """Return the first IMDB title link found on the page at `url`.

    Args:
        url: Page to fetch via `get_html`.
        cookie: Cookie dict forwarded to `get_html`.

    Returns:
        The first `http(s)://www.imdb.com/title/...` URL that is followed by
        a double quote in the page HTML, or None when the page could not be
        fetched or contains no such link.
    """
    html = get_html(url, cookie)
    if not html:
        # get_html returns None once its retries are exhausted.
        return None
    # Dots are escaped so they match literally, and https links are accepted
    # alongside http ones.
    match = re.search(r'(https?://www\.imdb\.com/title/.+?)"', html)
    return match.group(1) if match else None

def get_cookie():
    """Build a cookie dict carrying a random `bid` value.

    The value is 11 distinct characters sampled from ASCII letters and
    digits, and is printed before being returned.

    Returns:
        A dict mapping the cookie name "bid" to the generated value, in the
        name -> value shape that the `cookies` argument of requests expects.
        The previous {"Cookie": "bid=..."} shape made requests send a cookie
        literally named "Cookie".
    """
    c = "".join(random.sample(string.ascii_letters + string.digits, 11))
    print(c)
    return {"bid": c}

if __name__ == "__main__":
    # Resolve the IMDB link for every movie not yet checked, newest id first.
    cookie = get_cookie()
    pending = Movie.select().where(Movie.is_check == 0).order_by(Movie.id.desc())
    for movie in pending:
        try:
            link = get_link(movie.Douban_Url, cookie)
        except Exception as exc:
            # Skip this row. Falling through would reuse the previous row's
            # `link` (or hit a NameError on the first row) and store the
            # wrong IMDB link.
            print("skip {}: {}".format(movie.Douban_Url, exc))
            continue
        if not link:
            continue
        Movie.update(
            IMDB=link, is_check=1, Update_time=datetime.datetime.now()
        ).where(Movie.id == movie.id).execute()
        print(movie.CN_name)

评论