BeautifulSoup(三)- - - 爬取豆瓣电影信息数据写入数据库

数据库表结构

-- One row per scraped movie detail page. Every column is nullable.
-- Multi-valued fields are stored as a single comma-separated string.
CREATE TABLE IF NOT EXISTS `movie` (
  `title` varchar(200) DEFAULT NULL,               -- movie title
  `directedBy` varchar(200) DEFAULT NULL,          -- director
  `screenwriter` varchar(200) DEFAULT NULL,        -- screenwriters, comma-separated
  `starring` varchar(400) DEFAULT NULL,            -- leading cast, comma-separated
  `genre` varchar(200) DEFAULT NULL,               -- genres, comma-separated
  `country` varchar(200) DEFAULT NULL,             -- production country / region
  `language` varchar(200) DEFAULT NULL,            -- spoken language(s)
  `initialReleaseDate` varchar(200) DEFAULT NULL,  -- release dates kept as text, comma-separated
  `runtime` varchar(200) DEFAULT NULL,             -- running time kept as text
  `othername` varchar(200) DEFAULT NULL,           -- alternative titles
  `IMDb` varchar(200) DEFAULT NULL,                -- link (href) taken from the detail page
  `score` float DEFAULT NULL,                      -- average rating
  `summary` text                                   -- plot summary
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

python脚本

#coding=utf-8
#设置编码为utf-8,便于后续处理中文

from __future__ import print_function
import urllib2
from bs4 import BeautifulSoup
import json
import requests
import MySQLdb
import random


# Helper that downloads a page and returns its raw HTML.
def getHtlm(url, timeout=None):
    """Fetch ``url`` and return the response body as read from the socket.

    url     -- address of the page to download.
    timeout -- socket timeout in seconds. When omitted, a random whole
               number of seconds between 1 and 10 is used.

    Raises whatever ``urllib2.urlopen`` raises (for example on an
    unreachable host or when the timeout expires).
    """
    if timeout is None:
        # The lower bound must be 1, not 0: a timeout of 0 switches the
        # socket to non-blocking mode and the connection attempt fails
        # immediately instead of waiting.
        timeout = random.randint(1, 10)
    page = urllib2.urlopen(url, timeout=timeout)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()

### Database connection settings.
# The "xxx" values look like placeholders for real credentials -- TODO confirm.
config = dict(
    host="127.0.0.1",
    port=3306,
    user="xxx",
    passwd="xxx",
    db="xxx",
    charset="utf8",
)

# Open the connection and a cursor that the rest of the script uses.
conn = MySQLdb.connect(**config)
curr = conn.cursor()


# The movie links come from Douban's "choose a movie" page:
# https://movie.douban.com/explore
# movie_list / link_list are kept from the original script; nothing below
# fills them.
movie_list=[]
link_list=[]

# The "load more" button on that page issues a GET request that returns
# JSON, 20 entries per request. page_start is appended to this prefix.
_SEARCH_URL = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%BB%8F%E5%85%B8&sort=recommend&page_limit=20&page_start="

# Explicit column list so the statement does not depend on the physical
# column order of the table.
_INSERT_SQL = """
insert into movie
    (`title`, `directedBy`, `screenwriter`, `starring`, `genre`, `country`,
     `language`, `initialReleaseDate`, `runtime`, `othername`, `IMDb`,
     `score`, `summary`)
values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""


def _utf8(text):
    """Return ``text`` encoded as UTF-8, or "" when ``text`` is None."""
    if text is None:
        return ""
    return text.encode("utf-8")


def _tag_text(tag):
    """Return the UTF-8 ``.string`` of ``tag``; "" if the tag or string is missing."""
    if tag is None:
        return ""
    return _utf8(tag.string)


def _joined_text(tags):
    """Join the text of every tag in ``tags`` with commas."""
    return ",".join(_tag_text(tag) for tag in tags)


def _value_after_label(body, label):
    """Return the node that directly follows the <span> whose text is ``label``.

    For markup such as <span class="pl">label</span> value<br/> the value
    is the span's next sibling. Returns "" when the label is absent.
    """
    span = body.find("span", text=label)
    if span is None:
        return ""
    return _utf8(span.next_sibling)


def _screenwriters(body):
    """Return the screenwriter names, comma-separated."""
    from bs4.element import Tag

    parts = []
    for label in body.findAll("span", text="编剧"):
        # The names sit in the element two siblings after the label span.
        separator = label.next_sibling
        names = separator.next_sibling if separator is not None else None
        if not isinstance(names, Tag):
            continue
        for child in names.children:
            text = child.string
            if text == " / ":
                parts.append(",")
            elif text is not None:
                parts.append(text.encode("utf-8"))
    return "".join(parts)


def _summary(body):
    """Return the summary text with newlines and spaces removed.

    Only the first child of each v:summary element is used.
    """
    parts = []
    for node in body.findAll(property="v:summary"):
        if not node.contents:
            continue
        text = node.contents[0].string
        if text is not None:
            parts.append(text.replace("\n","").replace(" ","").encode("utf-8"))
    return "".join(parts)


def _scrape_movie(url):
    """Download one movie detail page and return its row for the movie table.

    The tuple follows the column list of _INSERT_SQL. Fields that are not
    present on the page become "" (NULL for the IMDb link and the score).
    Returns None when the page has no <body>.
    """
    body = BeautifulSoup(getHtlm(url), "lxml").body
    if body is None:
        return None

    title = _tag_text(body.find(property="v:itemreviewed"))
    directedBy = _tag_text(body.find(rel="v:directedBy"))
    screenwriter = _screenwriters(body)
    starring = _joined_text(body.findAll(rel="v:starring"))
    genre = _joined_text(body.findAll(property="v:genre"))
    country = _value_after_label(body, "制片国家/地区:")
    language = _value_after_label(body, "语言:")
    initialReleaseDate = _joined_text(body.findAll(property="v:initialReleaseDate"))
    runtime = _tag_text(body.find(property="v:runtime"))
    othername = _value_after_label(body, "又名:")

    # NOTE(review): this takes the first rel="nofollow" target="_blank" link
    # on the page and assumes it is the IMDb link -- confirm.
    link = body.find(rel="nofollow", target="_blank")
    IMDb = link.get("href") if link is not None else None

    # An absent rating is stored as NULL rather than "" in the float column.
    score = _tag_text(body.find(property="v:average")) or None

    summary = _summary(body)

    return (title, directedBy, screenwriter, starring, genre, country,
            language, initialReleaseDate, runtime, othername, IMDb, score,
            summary)


try:
    for page in range(10):
        # NOTE(review): page_start runs 20, 40, ..., 200, so the entries at
        # offset 0-19 are never requested -- confirm this is intended.
        # The timeout must be strictly positive; requests rejects 0.
        r = requests.get(_SEARCH_URL + str((page + 1) * 20), timeout=random.randint(1, 8))
        r.raise_for_status()
        # requests' built-in JSON decoder turns the reply into Python data.
        for subject in r.json()["subjects"]:
            row = _scrape_movie(subject["url"])
            if row is None:
                continue
            curr.execute(_INSERT_SQL, row)
            # Commit per movie so rows already scraped survive a later failure.
            conn.commit()
finally:
    # Always release the cursor and connection, even after an error.
    curr.close()
    conn.close()
2017-08-04 00:49 45 技术
Comments
Write a Comment