Scrapy爬虫系列之豆瓣电影Top250

1. items.py(定义爬取的内容)


1
2
3
4
5
6
7
8
9
10
import scrapy


class Dbmoviestop250Item(scrapy.Item):
    """Container for the fields scraped for one Douban Top250 movie."""

    name = scrapy.Field()            # movie title
    year = scrapy.Field()            # release year
    score = scrapy.Field()           # Douban rating
    director = scrapy.Field()        # director(s)
    classification = scrapy.Field()  # genre
    actor = scrapy.Field()           # actor(s)
    image_urls = scrapy.Field()      # cover image URLs, consumed by the images pipeline

2. spider 的编写


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector

from dbMoviesTop250.items import Dbmoviestop250Item


class MovieSpider(CrawlSpider):
    """Crawl the Douban Top250 list pages and parse each movie detail page."""

    name = 'movies'
    allowed_domains = ['movie.douban.com']
    # Douban redirects plain HTTP to HTTPS, so start on HTTPS and accept
    # both schemes in the link patterns; dots are escaped so they only
    # match a literal '.' rather than any character.
    start_urls = ['https://movie.douban.com/top250']

    rules = [
        # Follow pagination links (?start=25, ?start=50, ...).
        Rule(LinkExtractor(allow=(r'https?://movie\.douban\.com/top250\?start=\d+.*',))),
        # Parse each movie detail page; don't crawl further from it.
        Rule(LinkExtractor(allow=(r'https?://movie\.douban\.com/subject/\d+',)),
             callback='parse_item', follow=False),
    ]

    def parse_item(self, response):
        """Extract one movie's fields from its detail page into an item.

        ``extract_first()`` returns ``None`` when an XPath matches nothing,
        instead of the ``IndexError`` that ``extract()[0]`` would raise on
        pages with a missing element.
        """
        item = Dbmoviestop250Item()
        item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract_first()
        item['year'] = response.xpath('//*[@id="content"]/h1/span[2]/text()').extract_first()
        item['score'] = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract_first()
        item['director'] = response.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract_first()
        item['classification'] = response.xpath('//span[@property="v:genre"]/text()').extract_first()
        item['actor'] = response.xpath('//*[@id="info"]/span[3]//a/text()').extract_first()
        item['image_urls'] = response.xpath('//div[@id="mainpic"]/a[@class="nbgnbg"]/img/@src').extract()
        return item

3. Pipeline 中处理数据及图片下载

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

import scrapy
from scrapy.exceptions import DropItem
# scrapy.contrib was removed in Scrapy 1.x; the images pipeline now lives
# under scrapy.pipelines.images.
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    """Download each movie's cover image, naming the file after the movie."""

    def get_media_requests(self, item, info):
        """Schedule one download request per cover URL.

        The item travels in ``meta`` so ``file_path()`` can read the name.
        """
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'item': item})

    def item_completed(self, results, item, info):
        """Drop items whose cover image failed to download."""
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            # DropItem must be imported from scrapy.exceptions; the original
            # snippet referenced it without importing (NameError at runtime).
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Store the image as ``full/<movie name>.jpg`` instead of a hash.

        ``item`` is accepted keyword-only for compatibility with newer
        Scrapy versions that pass it; the meta item set in
        ``get_media_requests`` remains the source of the name.
        """
        item = request.meta['item']
        name = item['name']
        return u'full/{0}.jpg'.format(name)

4. settings.py 中设置几个变量

1
2
3
4
5
6

# Root directory for the images pipeline ('.' = project's current directory).
IMAGES_STORE = '.'

# Register the custom images pipeline at priority 300.
ITEM_PIPELINES = {
    'dbMoviesTop250.pipelines.MyImagesPipeline': 300,
}

5. 结果


写好后保存然后在目录下运行 scrapy crawl movies -o data.json 等待一会, 即可在目录下看到 data.json 文件如下:

和 封面图片都被下载下来咯

项目代码, 点我下载。

这样Top250电影的相关信息就被我们拿到啦。 赶快试试吧!!!