Install the Charles packet-capture tool
Reference links:
https://blog.csdn.net/HeyShHeyou/article/details/90045204
https://www.cnblogs.com/xiao-xue-di/p/12720995.html#_label1
After configuring Charles, open the WeChat mini program and inspect the requests Charles captures.
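Before writing any spider code, it can be worth replaying a captured request outside the mini program to confirm the token works. A minimal sketch using the requests library, assuming the endpoint and JSON payload observed in Charles (the same ones the spider below uses); the token value is a placeholder:

import requests

url = 'https://csapi.weimiaocaishang.com/WxBookTags/categoryBookList'
data = {"cate": 999, "second_cate": 999, "page": 1, "is_listen_type": 0,
        "sort_type": "shelf_time_desc", "token": "<token captured by Charles>"}

# Replay the captured POST request and inspect the JSON response
resp = requests.post(url, json=data)
print(resp.json())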
Install Scrapy for Python
Run the following command to install Scrapy:
pip install scrapy
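You can confirm the install succeeded with:
scrapy version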
References on using Scrapy:
https://blog.csdn.net/ck784101777/article/details/104468780/
https://www.jianshu.com/p/cecb29c04cd2
Take a moment to understand how Scrapy's crawl loop works.
Common Scrapy commands
Create a project: scrapy startproject xxx
Enter the project: cd xxx  # change into the project directory
Create a spider: scrapy genspider xxx (spider name) xxx.com (domain to crawl)
Export scraped data: scrapy crawl xxx -o xxx.json (write output to a file of the given type)
Run a spider: scrapy crawl xxx
List all spiders: scrapy list
Show settings: scrapy settings [options]
Create the Scrapy project
scrapy startproject books
Enter the project directory
cd books
Create the spider
scrapy genspider book csapi.weimiaocaishang.com
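At this point the generated project should look roughly like this (the exact layout varies slightly with the Scrapy version); the next steps edit settings.py, items.py, spiders/book.py, and pipelines.py:

books/
├── scrapy.cfg
└── books/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── book.py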
Update the configuration in settings.py:
USER_AGENT = 'Mozilla/5.0'
# Whether to obey robots.txt rules
ROBOTSTXT_OBEY = False
# Register the item pipelines (lower numbers run first)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'books.pipelines.BooksPipeline': 300,
    # Image download pipeline
    'books.pipelines.CoverImgPipeline': 400,
    # Audio file download pipeline
    'books.pipelines.AudioFilesPipeline': 401,
}
# Where downloaded images are stored
IMAGES_STORE = './storage'
# Where downloaded files are stored
FILES_STORE = './storage'
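Two notes on this configuration: the pipeline priority numbers determine execution order (lower runs first), and the built-in ImagesPipeline requires Pillow (pip install pillow). A small sketch to confirm the settings load correctly, run from the project root:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
# Lower numbers run first: ImagesPipeline (1) runs before BooksPipeline (300)
print(settings.get('ITEM_PIPELINES'))
print(settings.get('IMAGES_STORE'), settings.get('FILES_STORE'))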
Define the items container that holds each scraped book, declaring the fields we need:
import scrapy


class BooksItem(scrapy.Item):
    # define the fields for your item here like:
    three_id = scrapy.Field()
    bookname = scrapy.Field()
    bookcontentimg = scrapy.Field()
    bookcontentword = scrapy.Field()
    bookcore = scrapy.Field()
    bookcontent = scrapy.Field()
    audiofiles = scrapy.Field()
    bookauthor = scrapy.Field()
    coverimg = scrapy.Field()
    playtime = scrapy.Field()
    bookheart = scrapy.Field()
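A scrapy.Item behaves like a dict restricted to its declared fields; assigning an undeclared key raises a KeyError. A quick illustration with made-up values:

from books.items import BooksItem

item = BooksItem()
item['bookname'] = 'Example Book'
item['three_id'] = 1
print(dict(item))      # {'bookname': 'Example Book', 'three_id': 1}
# item['unknown'] = 1  # would raise KeyError: undeclared field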
The genspider command run earlier generated the spider file. The code for book.py is as follows:
import scrapy
import json
from ..items import BooksItem


class BookSpider(scrapy.Spider):
    # Spider name
    name = 'book'
    # Allowed domains
    allowed_domains = ['csapi.weimiaocaishang.com']
    # Start URL
    start_urls = ['https://csapi.weimiaocaishang.com/WxBookTags/categoryBookList']
    token = '<obtained via Charles packet capture>'
    page = 0

    def start_requests(self):
        """Override how the initial request is issued."""
        # Request URL
        post_url = 'https://csapi.weimiaocaishang.com/WxBookTags/categoryBookList'
        # Request payload
        self.page += 1
        data = {"cate": 999, "second_cate": 999, "page": self.page, "is_listen_type": 0,
                "sort_type": "shelf_time_desc", "token": self.token}
        # Send the POST request
        yield scrapy.http.JsonRequest(url=post_url, data=data, callback=self.parse_book_list)

    def parse_book_list(self, response):
        """
        Parse the book list.
        :param response:
        :return:
        """
        result = json.loads(response.text)
        if result['statusCode'] == '200':
            if len(result['data']) > 0:
                # Request URL
                post_url = 'https://csapi.weimiaocaishang.com/WxBookTags/categoryBookList'
                # Request payload
                self.page += 1
                print('Current page: %d' % self.page)
                data = {"cate": 999, "second_cate": 999, "page": self.page, "is_listen_type": 0,
                        "sort_type": "shelf_time_desc", "token": self.token}
                # Request the next page
                yield scrapy.http.JsonRequest(url=post_url, data=data, callback=self.parse_book_list)
                for item in result['data']:
                    # Fetch the book transcript
                    post_url = 'https://csapi.weimiaocaishang.com/WxBook/manuscript'
                    # Request payload
                    data = {"book_id": item['id'], "token": self.token}
                    # Send the POST request
                    yield scrapy.http.JsonRequest(url=post_url, data=data, callback=self.parse_book_manuscript)

    def parse_book_manuscript(self, response):
        """
        Parse the book transcript.
        :param response:
        :return:
        """
        items = BooksItem()
        result = json.loads(response.text)
        if result['statusCode'] == '200':
            # Detail record
            detail = result['data']['book_info'][0]
            items['three_id'] = detail['id']
            items['bookname'] = detail['bookname']
            items['bookcontentimg'] = detail['bookcontentimg']
            items['bookcontentword'] = detail['bookcontentword']
            items['bookcontent'] = detail['bookcontent']
            items['bookcore'] = detail['bookcore']
            items['audiofiles'] = detail['audiofiles']
            items['bookauthor'] = detail['bookauthor']
            items['coverimg'] = detail['coverimg']
            items['playtime'] = detail['playtime']
            items['bookheart'] = detail['bookheart']
            # Put the scraped book data into the item container
            yield items
Spider code walkthrough:
start_requests is overridden so the spider issues a POST request through a JsonRequest object; the response is handed to the parse_book_list callback (see the sketch after this walkthrough for what JsonRequest does under the hood).
parse_book_list converts the response body into a Python dict with json.loads() and checks whether data contains anything; if it does, it repeats the flow, yielding another JsonRequest for the next page.
It then loops over the returned list and requests the detail endpoint with each book's ID, handing those responses to the parse_book_manuscript callback.
parse_book_manuscript unpacks the detail data into a BooksItem and yields it.
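For reference, JsonRequest serializes the data dict into a JSON request body, sets the Content-Type: application/json header, and defaults to POST. A rough hand-built equivalent (a sketch, not JsonRequest's exact implementation):

import json
import scrapy

post_url = 'https://csapi.weimiaocaishang.com/WxBookTags/categoryBookList'
data = {"cate": 999, "second_cate": 999, "page": 1, "is_listen_type": 0,
        "sort_type": "shelf_time_desc", "token": "<token>"}

# Roughly what scrapy.http.JsonRequest(url=post_url, data=data) produces:
req = scrapy.Request(
    url=post_url,
    method='POST',
    body=json.dumps(data),
    headers={'Content-Type': 'application/json'},
)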
The code for pipelines.py:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.pipelines.files import FilesPipeline
import scrapy
import pymysql
import json


class BooksPipeline(object):
    """
    Book pipeline: writes each scraped book into MySQL.
    """
    def __init__(self):
        # Open the database connection
        self.db = pymysql.connect(
            host='localhost',
            port=3306,
            db='spy_book',
            user='spy_book',
            passwd='spy_book',
            charset='utf8mb4',
            use_unicode=True)
        # Create a cursor object
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Strip the OSS host prefix so only relative paths are stored
        bookcontentimg = item['bookcontentimg'].replace('https://xiaobai-dushuhui.oss-cn-huhehaote.aliyuncs.com/test', '')
        coverimg = item['coverimg'].replace('https://xiaobai-dushuhui.oss-cn-huhehaote.aliyuncs.com/test', '')
        audiofiles = item['audiofiles'].replace('https://xiaobai-dushuhui.oss-cn-huhehaote.aliyuncs.com/test', '')
        # Execute the insert
        sql = """insert into `book` (`three_id`,`bookname`,`bookcontentimg`,`bookcore`,`audiofiles`,`bookauthor`,`coverimg`,`playtime`,`bookcontentword`,`bookcontent`,`bookheart`) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        self.cursor.execute(sql, (str(item['three_id']),
                                  str(item['bookname']),
                                  str(bookcontentimg),
                                  str(item['bookcore']),
                                  str(audiofiles),
                                  str(item['bookauthor']),
                                  str(coverimg),
                                  str(item['playtime']),
                                  json.dumps(item['bookcontentword'], ensure_ascii=False),
                                  str(item['bookcontent']),
                                  json.dumps(item['bookheart'], ensure_ascii=False),
                                  )
                            )
        self.db.commit()
        return item


class CoverImgPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Download both the content image and the cover image
        yield scrapy.Request(url=item['bookcontentimg'])
        yield scrapy.Request(url=item['coverimg'])

    def file_path(self, request, response=None, info=None):
        """
        Override the image storage path: keep the path relative to the OSS prefix.
        :param request:
        :param response:
        :param info:
        :return:
        """
        filename = request.url.replace('https://xiaobai-dushuhui.oss-cn-huhehaote.aliyuncs.com/test/', '')
        return filename


class AudioFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['audiofiles'])

    def file_path(self, request, response=None, info=None):
        """
        Override the audio storage path: keep the path relative to the OSS prefix.
        :param request:
        :param response:
        :param info:
        :return:
        """
        filename = request.url.replace('https://xiaobai-dushuhui.oss-cn-huhehaote.aliyuncs.com/test/', '')
        return filename
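BooksPipeline assumes a book table already exists in the spy_book database. A possible schema, created here with pymysql (the column names match the INSERT statement above, but the column types are assumptions; adjust them to your data):

import pymysql

db = pymysql.connect(host='localhost', port=3306, db='spy_book',
                     user='spy_book', passwd='spy_book', charset='utf8mb4')
with db.cursor() as cursor:
    # Hypothetical schema covering every column used by BooksPipeline
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS `book` (
          `id` INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
          `three_id` VARCHAR(32),
          `bookname` VARCHAR(255),
          `bookcontentimg` VARCHAR(512),
          `bookcore` TEXT,
          `audiofiles` VARCHAR(512),
          `bookauthor` VARCHAR(255),
          `coverimg` VARCHAR(512),
          `playtime` VARCHAR(32),
          `bookcontentword` MEDIUMTEXT,
          `bookcontent` MEDIUMTEXT,
          `bookheart` MEDIUMTEXT
        ) DEFAULT CHARSET=utf8mb4
    """)
db.close()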
For the pipeline classes in this file to be instantiated, they must be registered under ITEM_PIPELINES in settings.py; see the configuration changes above.
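With the settings, items, spider, and pipelines in place, run the spider from the project root:
scrapy crawl book
Book records are written to MySQL by BooksPipeline, while cover images and audio files are downloaded under ./storage by the media pipelines.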