import time
import requests
import os
import random
import bs4
from bs4 import BeautifulSoup
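# Scrape photo galleries from mzitu.com: walk the index pages, open each
# gallery, and save every image under save_path, rotating User-Agents and
# sending a Referer header to get past the site's hotlink protection.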
# The more user agents here, the better (one is picked at random per request)
meizi_headers = [
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 "
"Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 "
"Safari/537.75.14",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 "
"Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
]
# Pick a random User-Agent so each request looks like it comes from a browser
headers = {'User-Agent': random.choice(meizi_headers)}
# Target site to crawl
url = 'https://www.mzitu.com/'
# Local save location (a raw string, so the backslashes are not treated as escapes)
save_path = r'E:\python\study\day1\save_path'
def create_dir(file_path):
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    # Switch the current working directory so images are saved here
    os.chdir(file_path)
def download(page_url, file_path):
    # Download every gallery linked from one index page.
    # file_path is unused here: create_dir() has already chdir'd into it,
    # so images land in the current working directory.
    # Send a GET request for the index page
    res_sub = requests.get(page_url, headers=headers)
    # Parse the HTML
    soup_sub = BeautifulSoup(res_sub.text, features="html.parser")
    img_a = soup_sub.find('div', class_="postlist").find_all('a')
    count = 0
    for a in img_a:
        count = count + 1
        # Each gallery entry carries two <a> tags (cover image and title),
        # so only every second link is followed
        if (count % 2) == 0:
            print('Gallery link #' + str(count))
            # Extract the gallery page's href
            href = a.attrs['href']
            print('Gallery URL: ' + href)
            res_sub_1 = requests.get(href, headers=headers)
            soup_sub_1 = BeautifulSoup(res_sub_1.text, features='html.parser')
            try:
                # The seventh <span> in the pager holds the gallery's page
                # count; the index depends on the site's current markup
                max_page = soup_sub_1.find('div', class_="pagenavi").find_all('span')[6].text
                print('Gallery size: ' + max_page)
                for j in range(1, int(max_page) + 1):
                    print('Gallery page: ' + str(j))
                    # Wait 1 to 3 seconds between requests to be polite
                    time.sleep(random.randint(1, 3))
                    page_url_sub = href + '/' + str(j)
                    res_sub_2 = requests.get(page_url_sub, headers=headers)
                    soup_sub_2 = BeautifulSoup(res_sub_2.text, features='html.parser')
                    # Locate the image element
                    img = soup_sub_2.find('div', class_="main-image").find('img')
                    if isinstance(img, bs4.element.Tag):
                        img_src = img.attrs['src']
                        array = img_src.split('/')
                        file_name = array[len(array) - 1]
                        # The site rejects hotlinked requests, so send the
                        # gallery page URL as the Referer with the image request
                        img_headers = {'User-Agent': random.choice(meizi_headers),
                                       'Referer': page_url_sub}
                        img_res = requests.get(img_src, headers=img_headers)
                        print('Saving image')
                        # 'wb' writes each image as a fresh file
                        with open(file_name, 'wb') as f:
                            f.write(img_res.content)
                        print('Image saved')
            except Exception as e:
                print(e)
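# A minimal optional sketch (not part of the script's flow above): fetching one
# image through a requests.Session, so the User-Agent and Referer travel with
# the session rather than a per-call dict. `fetch_image` is a hypothetical
# helper name introduced only for illustration.
def fetch_image(img_src, referer):
    session = requests.Session()
    session.headers.update({
        'User-Agent': random.choice(meizi_headers),
        'Referer': referer,  # satisfies the site's hotlink check
    })
    # Return the raw image bytes, ready to be written to disk
    return session.get(img_src).content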
def main():
    # Send a GET request for the front page
    res = requests.get(url, headers=headers)
    # Parse with Python's built-in HTML parser
    soup = BeautifulSoup(res.text, features="html.parser")
    # Create the save folder
    create_dir(save_path)
    # The fourth pager link on the front page holds the total number of index pages
    img_max = soup.find('div', class_='nav-links').find_all('a')[3].text
    for i in range(1, int(img_max) + 1):
        # Build each index page's URL; page 1 has no /page/ suffix
        if i == 1:
            page_url = url
        else:
            page_url = url + 'page/' + str(i)
        # One subdirectory per index page
        file_path = save_path + '\\' + str(i)
        # Create the directory (and chdir into it)
        create_dir(file_path)
        # Download every gallery on this index page
        download(page_url, file_path)
        # Stop after the first index page; remove this break to crawl them all
        break
if __name__ == '__main__':
    main()
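# Run the script directly: it creates save_path plus one numbered subdirectory
# per index page it crawls, then fills each with that page's gallery images.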