BeautifulSoup(二)- - - 爬取豆瓣电影信息数据

在豆瓣电影页面中查看到的 HTML 片段如下:

#主题部分            
            
    <div id="dale_movie_subject_top_icon"></div>
    <h1>
        <span property="v:itemreviewed">绣春刀II:修罗战场</span>
            <span class="year">(2017)</span>
    </h1>

    

<div id="info">
        <span ><span class='pl'>导演</span>: <span class='attrs'><a href="/celebrity/1321200/" rel="v:directedBy">路阳</a></span></span><br/>
        <span ><span class='pl'>编剧</span>: <span class='attrs'><a href="/celebrity/1317594/">陈舒</a> / <a href="/celebrity/1321200/">路阳</a> / <a href="/celebrity/1377577/">禹扬</a></span></span><br/>
        <span class="actor"><span class='pl'>主演</span>: <span class='attrs'><a href="/celebrity/1077991/" rel="v:starring">张震</a> / <a href="/celebrity/1052359/" rel="v:starring">杨幂</a> / <a href="/celebrity/1274761/" rel="v:starring">张译</a> / <a href="/celebrity/1312940/" rel="v:starring">雷佳音</a> / <a href="/celebrity/1318720/" rel="v:starring">辛芷蕾</a> / <a href="/celebrity/1275482/" rel="v:starring">金士杰</a> / <a href="/celebrity/1376605/" rel="v:starring">刘端端</a> / <a href="/celebrity/1351719/" rel="v:starring">武强</a> / <a href="/celebrity/1342478/" rel="v:starring">杨轶</a> / <a href="/celebrity/1314374/" rel="v:starring">李媛</a> / <a href="/celebrity/1315721/" rel="v:starring">吴晓亮</a> / <a href="/celebrity/1317230/" rel="v:starring">李洪涛</a> / <a href="/celebrity/1332806/" rel="v:starring">刘峰超</a> / <a href="/celebrity/1274820/" rel="v:starring">袁文康</a> / <a href="/celebrity/1371225/" rel="v:starring">马赫</a> / <a href="/celebrity/1326377/" rel="v:starring">刘亭作</a> / <a href="/celebrity/1322077/" rel="v:starring">姜晓冲</a> / <a href="/celebrity/1376606/" rel="v:starring">陈齐威</a> / <a href="/celebrity/1319834/" rel="v:starring">王仁君</a></span></span><br/>
        <span class="pl">类型:</span> <span property="v:genre">剧情</span> / <span property="v:genre">动作</span> / <span property="v:genre">武侠</span> / <span property="v:genre">古装</span><br/>
        
        <span class="pl">制片国家/地区:</span> 中国大陆<br/>
        <span class="pl">语言:</span> 汉语普通话<br/>
        <span class="pl">上映日期:</span> <span property="v:initialReleaseDate" content="2017-07-19(中国大陆)">2017-07-19(中国大陆)</span> / <span property="v:initialReleaseDate" content="2017-06-18(上海电影节)">2017-06-18(上海电影节)</span><br/>
        <span class="pl">片长:</span> <span property="v:runtime" content="120">120分钟</span><br/>
        <span class="pl">又名:</span> 绣春刀2:修罗战场 / 绣春刀:修罗场 / 绣春刀前传<br/>
        <span class="pl">IMDb链接:</span> <a href="http://www.imdb.com/title/tt7055592" target="_blank" rel="nofollow">tt7055592</a><br>

</div>


#得分部分    
<div class="rating_self clearfix" typeof="v:Rating">
    <strong class="ll rating_num" property="v:average">7.6</strong>
    <span property="v:best" content="10.0"></span>
    <div class="rating_right ">
        <div class="ll bigstar bigstar40"></div>
        <div class="rating_sum">
                <a href="collections" class="rating_people"><span property="v:votes">83908</span>人评价</a>
        </div>
    </div>
</div>



##简介部分
            <div class="indent" id="link-report">
                    
                        <span property="v:summary" class="">
                                  明天启七年,北镇抚司锦衣卫沈炼(张震 饰)在一次扫除乱党任务中,为救画师北斋(杨幂 饰),将同僚凌云铠(武强 饰)灭口。此后一方面要摆脱来自陆文昭(张译 饰)、裴纶(雷佳音 饰)的质疑与调查,一方面又在神秘女子的要挟下放火烧了锦衣卫经历司。裹挟在乱世,沈炼与北斋情陷其中,却越陷越深。而在这一切的背后,巨大阴谋正暗中布局。众生如蝼蚁囿于修罗场,逆鳞之战,一触即发……
                        </span>
                        
            </div>

获取基础数据

#coding=utf-8
#设置编码为utf-8,便于后续处理中文

import urllib2
from bs4 import BeautifulSoup
import json
from __future__ import print_function

#Helper that downloads a page and returns its HTML
def getHtlm(url):
    """Fetch ``url`` with urllib2 and return the raw response body.

    url: address of the page to download.
    Returns whatever ``read()`` yields for the response.
    Errors raised by ``urllib2.urlopen`` or ``read()`` propagate to the caller.
    """
    page = urllib2.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()

#Download the HTML of the Douban page for the movie "绣春刀II", used as the example page
html_test = getHtlm("https://movie.douban.com/subject/26270502/?from=showing")

##Parse the downloaded HTML with the lxml parser and keep the result in the movie variable
movie = BeautifulSoup(html_test, "lxml")

初始化准备获取的字段

# Every field that will be scraped starts out as an empty string.
# Strings are immutable, so sharing one "" object across names is safe.
title = directedBy = screenwriter = starring = genre = ""
country = language = initialReleaseDate = runtime = ""
othername = IMDb = score = average = summary = ""


##Extract the fields one by one from the parsed page held in `movie`.
##Text values are encoded to UTF-8; the IMDb href is stored as returned by .get().
##Multi-valued fields are joined with "," using str.join, so the separator no longer
##depends on comparing each tag with the last one (which re-ran findAll on every
##iteration and dropped a comma whenever an earlier tag compared equal to the last).

#title: movie title
title = movie.body.find(property="v:itemreviewed").string.encode("utf-8")

#directedBy: director
directedBy = movie.body.find(rel="v:directedBy").string.encode("utf-8")

#screenwriter: the names live in the element two siblings after the "编剧" label span;
#its " / " separator strings become commas, every other child contributes its text
screenwriter = ""
for label in movie.body.findAll("span",text="编剧"):
    screenwriter = screenwriter + "".join(
        "," if node.string == " / " else node.string.encode("utf-8")
        for node in label.next_sibling.next_sibling
    )

#starring: leading actors
starring = ",".join(
    tag.string.encode("utf-8") for tag in movie.body.findAll(rel="v:starring")
)

#genre: movie genres
genre = ",".join(
    tag.string.encode("utf-8") for tag in movie.body.findAll(property="v:genre")
)

#country: country/region
#For markup like <span class="pl">制片国家/地区:</span> 中国大陆<br/> the value is the
#node right after the label span, so it is read with next_sibling
country = movie.body.find("span",text="制片国家/地区:").next_sibling.encode("utf-8")

#language: language
language = movie.body.find("span",text="语言:").next_sibling.encode("utf-8")

#initialReleaseDate: release dates
initialReleaseDate = ",".join(
    tag.string.encode("utf-8")
    for tag in movie.body.findAll(property="v:initialReleaseDate")
)

#runtime: running time
runtime = movie.body.find(property="v:runtime").string.encode('utf-8')

#othername: alternative titles
othername = movie.body.find("span",text="又名:").next_sibling.encode("utf-8")

#IMDb: href of the first link that has rel="nofollow" and target="_blank"
IMDb = movie.body.find(rel="nofollow",target="_blank").get("href")

#score: rating
score = movie.body.find(property="v:average").string.encode("utf-8")

#summary: synopsis. Only contents[0] of each summary element is used, so a <br />
#inside the summary does not break extraction; anything after the <br /> is ignored.
#Newlines and spaces are stripped from the text.
summary = "".join(
    node.contents[0].replace("\n","").replace(" ","").encode("utf-8")
    for node in movie.body.findAll(property="v:summary")
)

构造json格式

# Gather the scraped fields into one mapping, keyed by field name,
# ready to be serialized as JSON.
data_new = dict(
    title=title,
    directedBy=directedBy,
    screenwriter=screenwriter,
    starring=starring,
    genre=genre,
    country=country,
    language=language,
    initialReleaseDate=initialReleaseDate,
    runtime=runtime,
    othername=othername,
    IMDb=IMDb,
    score=score,
    summary=summary,
)

转化为json

json_str=json.dumps(data_new, ensure_ascii=False, indent=2)      
## Serialize to JSON; ensure_ascii=False keeps non-ASCII characters as-is instead of escaping them

输出结果

# Print the serialized JSON string
print(json_str)
{
  "othername": " 绣春刀2:修罗战场 / 绣春刀:修罗场 / 绣春刀前传", 
  "starring": "张震,杨幂,张译,雷佳音,辛芷蕾,金士杰,刘端端,武强,杨轶,李媛,吴晓亮,李洪涛,刘峰超,袁文康,马赫,刘亭作,姜晓冲,陈齐威,王仁君", 
  "genre": "剧情,动作,武侠,古装", 
  "summary": "  明天启七年,北镇抚司锦衣卫沈炼(张震饰)在一次扫除乱党任务中,为救画师北斋(杨幂饰),将同僚凌云铠(武强饰)灭口。此后一方面要摆脱来自陆文昭(张译饰)、裴纶(雷佳音饰)的质疑与调查,一方面又在神秘女子的要挟下放火烧了锦衣卫经历司。裹挟在乱世,沈炼与北斋情陷其中,却越陷越深。而在这一切的背后,巨大阴谋正暗中布局。众生如蝼蚁囿于修罗场,逆鳞之战,一触即发……", 
  "directedBy": "路阳", 
  "language": " 汉语普通话", 
  "title": "绣春刀II:修罗战场", 
  "country": " 中国大陆", 
  "screenwriter": "陈舒,路阳,禹扬", 
  "score": "7.6", 
  "IMDb": "http://www.imdb.com/title/tt7055592", 
  "initialReleaseDate": "2017-07-19(中国大陆),2017-06-18(上海电影节)", 
  "runtime": "120分钟"
}

完整版本

#coding=utf-8
#设置编码为utf-8,便于后续处理中文

from __future__ import print_function

import json
import urllib2

import requests
from bs4 import BeautifulSoup

#Helper that downloads a page and returns its HTML
def getHtlm(url, timeout=2):
    """Fetch ``url`` with urllib2 and return the raw response body.

    url: address of the page to download.
    timeout: seconds passed to ``urllib2.urlopen``; defaults to 2, the value
        that used to be hard-coded here.
    Returns whatever ``read()`` yields for the response.
    Errors raised by ``urllib2.urlopen`` or ``read()`` propagate to the caller.
    """
    page = urllib2.urlopen(url, timeout=timeout)
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails.
        page.close()

#link_list collects the movie page URLs; movie_list later receives one JSON string per movie.
#The URLs come from Douban's "选电影" (explore) page: https://movie.douban.com/explore
movie_list=[]
link_list=[]  # was appended to without ever being defined (NameError)
for i in range(1):
    #The page's "加载更多" (load more) button issues a GET request that returns JSON,
    #20 entries per request. page_start now follows the loop index; the previous
    #constant (1+1)*20 requested the same offset on every iteration.
    #NOTE(review): assumes (1+1) was a typo for (i+1) - confirm the intended first offset.
    r = requests.get("https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start="+str((i+1)*20), timeout=1)
    #Requests' built-in JSON decoder turns the response body into Python objects
    link_list.extend(subject["url"] for subject in r.json()["subjects"])

        
#        
#Download every collected movie page, extract its fields, print them as JSON and
#keep the JSON string in movie_list.
for link in link_list:
    html_test = getHtlm(link)
    movie = BeautifulSoup(html_test, "lxml")

    ##Reset every field so nothing carries over from the previous movie
    title=""
    directedBy=""
    screenwriter=""
    starring=""
    genre=""
    country=""
    language=""
    initialReleaseDate=""
    runtime=""
    othername=""
    IMDb=""
    score=""
    average=""
    summary=""

    #title: movie title
    title = movie.body.find(property="v:itemreviewed").string.encode("utf-8")

    #directedBy: director
    directedBy = movie.body.find(rel="v:directedBy").string.encode("utf-8")

    #screenwriter: the names live in the element two siblings after the "编剧" label span;
    #its " / " separator strings become commas, every other child contributes its text
    for label in movie.body.findAll("span",text="编剧"):
        screenwriter = screenwriter + "".join(
            "," if node.string == " / " else node.string.encode("utf-8")
            for node in label.next_sibling.next_sibling
        )

    #Multi-valued fields are joined with "," via str.join. The earlier version compared
    #each tag with the last findAll result, which re-ran findAll on every iteration and
    #dropped a comma whenever an earlier tag compared equal to the last one.

    #starring: leading actors
    starring = ",".join(
        tag.string.encode("utf-8") for tag in movie.body.findAll(rel="v:starring")
    )

    #genre: movie genres
    genre = ",".join(
        tag.string.encode("utf-8") for tag in movie.body.findAll(property="v:genre")
    )

    #country: country/region
    #For markup like <span class="pl">制片国家/地区:</span> 中国大陆<br/> the value is the
    #node right after the label span, so it is read with next_sibling
    country = movie.body.find("span",text="制片国家/地区:").next_sibling.encode("utf-8")

    #language: language
    language = movie.body.find("span",text="语言:").next_sibling.encode("utf-8")

    #initialReleaseDate: release dates
    initialReleaseDate = ",".join(
        tag.string.encode("utf-8")
        for tag in movie.body.findAll(property="v:initialReleaseDate")
    )

    #runtime: running time
    runtime = movie.body.find(property="v:runtime").string.encode('utf-8')

    #othername: alternative titles
    othername = movie.body.find("span",text="又名:").next_sibling.encode("utf-8")

    #IMDb: href of the first link that has rel="nofollow" and target="_blank"
    IMDb = movie.body.find(rel="nofollow",target="_blank").get("href")

    #score: rating
    score = movie.body.find(property="v:average").string.encode("utf-8")

    #summary: synopsis. Only contents[0] of each summary element is used, so anything
    #after an embedded <br /> is ignored; newlines and spaces are stripped.
    summary = summary + "".join(
        node.contents[0].replace("\n","").replace(" ","").encode("utf-8")
        for node in movie.body.findAll(property="v:summary")
    )

    data_new={
    "title":title,
    "directedBy":directedBy,
    "screenwriter":screenwriter,
    "starring":starring,
    "genre":genre,
    "country":country,
    "language":language,
    "initialReleaseDate":initialReleaseDate,
    "runtime":runtime,
    "othername":othername,
    "IMDb":IMDb,
    "score":score,
    "summary":summary
    }

    #ensure_ascii=False keeps non-ASCII characters as-is instead of escaping them
    json_str=json.dumps(data_new, ensure_ascii=False, indent=2)
    print(json_str)
    movie_list.append(json_str)
{
  "othername": " 卑鄙的我3 / 坏蛋奖门人3(港) / 神偷奶爸3之小小兵", 
  "starring": "史蒂夫·卡瑞尔,克里斯汀·韦格,崔·帕克,米兰达·卡斯格拉夫,达纳·盖尔,内芙·沙雷尔,皮埃尔·科凡,史蒂夫·库根,朱莉·安德鲁斯,珍妮·斯蕾特,迈克尔·贝亚蒂,安迪·尼曼,阿德里安·奇斯卡托,布莱恩·T·德莱尼,肯·道里欧", 
  "genre": "喜剧,动画,冒险", 
  "summary": "  《神偷奶爸3》将延续前两部的温馨、搞笑风格,聚焦格鲁和露西的婚后生活,继续讲述格鲁和三个女儿的爆笑故事。“恶棍”奶爸格鲁将会如何对付大反派巴萨扎·布莱德,调皮可爱的小黄人们又会如何耍贱卖萌,无疑让全球观众万分期待。该片配音也最大程度沿用前作阵容,史蒂夫·卡瑞尔继续为男主角格鲁配音,皮埃尔·柯芬也将继续为经典角色小黄人配音,而新角色巴萨扎·布莱德则由《南方公园》主创元老崔·帕克为其配音。", 
  "directedBy": "凯尔·巴尔达", 
  "language": " 英语", 
  "title": "神偷奶爸3 Despicable Me 3", 
  "country": " 美国", 
  "screenwriter": "辛科·保罗,肯·道里欧", 
  "score": "7.0", 
  "IMDb": "http://www.imdb.com/title/tt3469046", 
  "initialReleaseDate": "2017-07-07(中国大陆),2017-06-14(安锡动画电影节),2017-06-30(美国)", 
  "runtime": "90分钟"
}
{
  "othername": " Wukong", 
  "starring": "彭于晏,倪妮,余文乐,俞飞鸿,欧豪,郑爽,乔杉,杨迪,巴音,王德顺", 
  "genre": "剧情,动作,奇幻", 
  "summary": "  这不是西游记的任何章节,这是悟空的故事,彼时孙悟空(彭于晏饰)还不是震撼天地的齐天大圣,他只是只桀傲不驯的猴子。天庭毁掉他的花果山以掌控众生命运,他便决心跟天庭对抗,毁掉一切戒律。在天庭,孙悟空遇到不能爱的阿紫(倪妮饰),一生的宿敌杨戬(余文乐饰),和思念昔日爱人阿月(郑爽饰)的天蓬(欧豪饰),他们的身份注定永生相杀,但其实不甘命运摆布的又何止孙悟空一人?却没想到反抗却带来更大的浩劫。他们所做的一切,究竟是不知天高地厚的热血轻狂,还是无奈宿命难改的压抑绝望?难道命运真的早已注定?悟空不服,他再次挥动金箍棒,要让这诸佛都烟消云散!", 
  "directedBy": "郭子健", 
  "language": " 汉语普通话", 
  "title": "悟空传", 
  "country": " 中国大陆", 
  "screenwriter": "郭子健,今何在,黄海,范文文,黄智亨", 
  "score": "5.4", 
  "IMDb": "http://www.imdb.com/title/tt6513406", 
  "initialReleaseDate": "2017-07-13(中国大陆)", 
  "runtime": "123分钟"
}
...

参考资料

1、urllib+BeautifulSoup无登录模式爬取豆瓣电影Top250

2、Beautiful Soup 4.2.0 documentation

3、python报错UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 4: ordinal not in range

4、python中json.dumps使用的坑以及字符编码

5、Python Requests 快速上手

2017-07-30 20:46 50 技术
Comments
Write a Comment