使用python爬取妹子图

Posted by Tesla9527 on April 22, 2018

参考博客https://blog.csdn.net/baidu_35085676/article/details/68958267,爬取逻辑一样,只是将请求与解析库改为了使用kennethreitz大神的requests-html

脚本如下:

from requests_html import HTMLSession
import os
import re


home_url = 'http://www.mzitu.com'

# HTTP request headers for fetching listing/album pages.
HostReferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
# Referer pointing at the image host defeats its hotlink protection.
PicReferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}
path = "E:\\mzitu\\"  # download root directory

# All characters Windows forbids in file/directory names, not just '?' and ':'.
_WIN_INVALID_CHARS = re.compile(r'[\\/:*?"<>|]')


def _sanitize_title(title):
    """Return *title* stripped of characters illegal in a Windows dir name."""
    return _WIN_INVALID_CHARS.sub('', title.strip())


def _save_pic(session, pic_url, final_path):
    """Download one image (with anti-hotlink headers) into *final_path*.

    Writes to an absolute path instead of relying on os.chdir, and uses a
    context manager so the file handle is closed even if the write fails.
    """
    r = session.get(pic_url, headers=PicReferer)
    file_name = pic_url.split('/')[-1]
    with open(os.path.join(final_path, file_name), 'wb') as f:
        f.write(r.content)


def _crawl_album(session, a, title):
    """Download every image of one album into its own directory.

    Albums whose directory already contains at least as many files as the
    album has pages are skipped (resume support, as in the original).
    """
    final_path = path + _sanitize_title(title)
    already_exists = os.path.exists(final_path)
    if not already_exists:
        os.makedirs(final_path)
    href = list(a.links)[0]  # album detail-page URL from the <a> element
    r = session.get(href, headers=HostReferer)
    # NOTE(review): hard-coded index 10 is tied to the 2018 page layout of
    # mzitu.com — fragile; verify against the live page before relying on it.
    pic_max = int(r.html.find('span')[10].text)
    print('图片数: ' + str(pic_max))
    if already_exists and len(os.listdir(final_path)) >= pic_max:
        print('已经保存完毕,跳过')
        return
    for num in range(1, pic_max + 1):
        r = session.get(href + '/' + str(num), headers=HostReferer)
        # The full-size image URL is the src attribute inside .main-image.
        pic_url = r.html.find('.main-image')[0].search('src="{}"')[0]
        print(pic_url)
        _save_pic(session, pic_url, final_path)
    print('爬取完成: ' + title)
    print('-----------------------------------')


def main():
    """Crawl every listing page of the site and download all albums."""
    session = HTMLSession()
    r = session.get(home_url, headers=HostReferer)
    # The second-to-last .page-numbers element holds the last page number.
    max_page = int(r.html.find('.page-numbers')[-2].text)

    page_url = 'http://www.mzitu.com/page/'
    for i in range(1, max_page + 1):
        r = session.get(page_url + str(i), headers=HostReferer)
        # All album links on the current listing page.
        all_a = r.html.find('.postlist')[0].find('a[target=_blank]')
        for a in all_a:
            title = a.text
            if title == '':
                continue
            print("准备爬取:" + title)
            _crawl_album(session, a, title)
        print('第', i, '页爬取完成')


if __name__ == '__main__':
    main()

爬取的福利图示例 img