Scrapy Crawler Series: Jikexueyuan Videos

1. item.py (define what to scrape)


from scrapy import Item, Field

class JikexueyuanItem(Item):
    course_id = Field()      # lesson page URL, doubles as an id
    course_name = Field()    # lesson title
    course_url = Field()     # direct video download URL
    course_path = Field()    # breadcrumb path (course categories)
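
Each Field() simply declares a key the item may carry; once instantiated, an Item behaves like a dict. A quick hypothetical usage sketch (my own, not from the original post):

item = JikexueyuanItem()
item['course_name'] = 'Example course'  # only declared fields may be assigned
print(item.get('course_url'))           # None until the spider fills it in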

2. Writing the spider


import re

from scrapy import Spider
from scrapy.http import Request
from jikexueyuan.items import JikexueyuanItem

class CourseSpider(Spider):
    name = "course"
    allowed_domains = ["jikexueyuan.com"]
    start_urls = [
        # To crawl every listing page, use:
        # 'http://www.jikexueyuan.com/course/?pageNum=%d' % i for i in range(1, 86)
        'http://www.jikexueyuan.com/course/?pageNum=1',
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Log in through a browser first and paste your cookies here.
        self.cookies = {your_cookies}

    def parse(self, response):
        # Every course link on the listing page.
        course_links = response.xpath(
            "//ul[@class='cf']/li/div[@class='lessonimg-box']/a/@href").extract()
        for page in course_links:
            yield Request(page, callback=self.get_course_page, cookies=self.cookies)

    def get_course_page(self, response):
        # Each lesson's link and title on the course page.
        for x in response.xpath("//ul/li/div[@class='text-box']/h2/a"):
            href = x.xpath('@href').extract_first()
            title = x.xpath('text()').extract_first()
            if not href or not title:
                continue
            meta = {'href': href, 'title': title}
            yield Request(href, callback=self.get_down_urls,
                          meta={'meta': meta}, cookies=self.cookies)

    def get_down_urls(self, response):
        meta = response.meta['meta']
        # The breadcrumb trail gives the lesson's category path.
        path = response.xpath(
            "//div[@class='crumbs']/div[@class='w-1000']/a/text()").extract()
        # The video address sits in a <source src="..."> tag on the page.
        course_down = re.findall(r'source src="(.*?)"', response.text, re.S)
        if course_down:
            item = JikexueyuanItem()
            item['course_id'] = meta['href']
            item['course_name'] = meta['title']
            item['course_url'] = course_down[0]
            item['course_path'] = path
            yield item
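
The spider only records download addresses. If you also want to save the video files themselves, one option is a small item pipeline. Below is a minimal sketch, my own addition rather than part of the original project, assuming course_url points at a directly downloadable .mp4 file; enable it in settings.py via ITEM_PIPELINES = {'jikexueyuan.pipelines.VideoDownloadPipeline': 300}:

import os
import urllib.request

class VideoDownloadPipeline(object):
    # Hypothetical add-on: saves each lesson's video to the working
    # directory as its item passes through the pipeline.

    def process_item(self, item, spider):
        # Assumes the URL is directly downloadable; sanitize the title
        # so it is usable as a file name.
        filename = '%s.mp4' % item['course_name'].strip().replace('/', '_')
        if not os.path.exists(filename):
            urllib.request.urlretrieve(item['course_url'], filename)
        return item

Note that synchronous downloads like this block the crawl while each file transfers; for a large job, Scrapy's built-in FilesPipeline is the more robust choice.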

3. Results


Once everything is saved, run scrapy crawl course -o data.json in the project directory. After a short wait, you will find a data.json file in the directory.
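
To sanity-check the export, here is a short sketch (my own addition), assuming the JSON array format that -o data.json produces:

import json

with open('data.json') as f:
    items = json.load(f)

print(len(items))               # total lessons scraped
print(items[0]['course_name'])  # first lesson's title
print(items[0]['course_url'])   # ...and its video download URL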

Project code: click here to download.

And with that, we have the names and download URLs of all 8,111 videos. Go give it a try!