scrapy爬虫
标签:python、scrapy、爬虫 | 原创 | 阅读时间:大约 3 分钟 | 约 844 字
安装
pip install Scrapy
创建项目
scrapy startproject tutorial
运行
scrapy crawl quotes
相关信息
quotes:爬虫名称,即爬虫类中 name 属性的值(不是爬虫文件的文件名)
保存爬虫数据为json
scrapy crawl quotes -O quotes.json
相关信息
需在parse方法内添加解析内容
def parse(self, response):
    """Yield one dict per quote element found in *response*.

    This is the body of the spider's ``parse`` callback: every
    ``div.quote`` element produces a dict with the quote text, its author
    and the list of its tag names.
    """
    for quote in response.css('div.quote'):
        yield {
            'text': quote.css('span.text::text').get(),
            'author': quote.css('small.author::text').get(),
            # The tag links use the class "tag" (singular); selecting
            # "a.tags" matches nothing and always yields an empty list.
            'tags': quote.css('div.tags a.tag::text').getall(),
        }
items模型数据
- 在items.py文件添加模型
class AppdemoItem(Item):
    """Item model declaring the fields a spider may fill in."""

    # Each attribute declares one field of the item.
    title = Field()
    link = Field()
    desc = Field()
- 在爬虫文件中parse方法添加模型数据
def parse(self, response):
    """Yield one populated ``AppdemoItem`` per quote element in *response*."""
    for quote in response.css('div.quote'):
        item = AppdemoItem()
        # Scrapy items are filled with dict-style access; attribute
        # assignment (``item.link = 1``) raises AttributeError.
        item['link'] = 1
        # Yield the item that was just populated, not a new empty instance.
        yield item
Item Loaders
- 需要在items文件中配置模型
- 在parse中添加如下方法
def parse(self, response):
    """Build an ``AppdemoItem`` through an ``ItemLoader`` and return it."""
    loader = ItemLoader(item=AppdemoItem(), response=response)
    # Populate the "title" field from the CSS selection 'title::text'.
    loader.add_css("title", 'title::text')
    return loader.load_item()
爬取淘宝
创建项目
scrapy startproject taobao
创建爬虫
scrapy genspider mytaobao https://www.taobao.com/
items.py
class TaobaoItem(scrapy.Item):
    """Fields collected for one product of a Taobao search result."""

    shop_name = scrapy.Field()   # shop name
    title = scrapy.Field()       # product title
    price = scrapy.Field()       # price
    buy_num = scrapy.Field()     # sales volume
    address = scrapy.Field()     # address
    pic_url = scrapy.Field()     # image link
    detail_url = scrapy.Field()  # detail-page link
mytaobao.py
import scrapy
import json,re
from taobao.items import TaobaoItem
class MytaobaoSpider(scrapy.Spider):
    """Spider that fetches a Taobao search-result page and yields its products.

    ``start_requests`` issues the search request together with a captured
    browser cookie.  ``parse`` saves the raw page to disk, extracts the JSON
    object assigned to ``g_page_config`` in the page source and yields one
    ``TaobaoItem`` per entry of its ``mods.itemlist.data.auctions`` list.
    """

    name = 'mytaobao'
    allowed_domains = ['www.taobao.com', 'taobao.com', 's.taobao.com']

    def start_requests(self):
        """Yield the initial search request(s), each carrying the cookies."""
        urls = [
            'https://s.taobao.com/search?q=%E8%A3%99%E5%AD%90&sort=sale-desc&s=44',
        ]
        # NOTE(security): this is a captured browser session cookie hard-coded
        # in source.  It should be loaded from configuration or the
        # environment instead of being committed.
        raw_cookie = "_m_h5_tk=1cd8de065945ab3f4e341f304d19530a_1655271188481; _m_h5_tk_enc=145e0d75865f74fda590f8a40fd14439; xlly_s=1; _samesite_flag_=true; cookie2=12d5ef85f124f5aaddf7004995927c61; t=473c6653df68764bd121e75ac78fd73d; _tb_token_=e3ee5ee3e446e; enc=xCQgeIGFP2zROdOi0aUHn%2B3XDvyLWqv9YAH03OYeOx6DpFPlJnJaj8qZ%2FFRJE5Ju9tIZQlaYjVB2Nwf8blEKVg%3D%3D; cna=qZMcGwBL1RMCATz3G4XpiNng; sgcookie=E100LTrACKM2RljYFFXiRoUwBXZEq8B6lMyepJEKAOUoA19BuRdrypVbMGbcV8w1ndrf2mSIK3dcsgoH8BsCAxgTfdGsuvoLXZts%2BTjMpkFCWthlxf3FjHguqTMjpYSjx1pE; unb=2872236772; uc3=id2=UUBfQCLoMk06dA%3D%3D&vt3=F8dCvC3%2BnkyPCKu0dGE%3D&nk2=piIK%2FC31Ly%2Fo6YTR6z9qIW0%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D; csg=84c47a65; lgc=%5Cu8BFA%5Cu8A001314055453562; cancelledSubSites=empty; cookie17=UUBfQCLoMk06dA%3D%3D; dnk=%5Cu8BFA%5Cu8A001314055453562; skt=75db6b0a8447c001; existShop=MTY1NTI2NDE5MQ%3D%3D; uc4=nk4=0%40pO1rSxGmF71fehdQkeWsJFCs%2BJnHSb9RazYJ%2Bg%3D%3D&id4=0%40U2LNbzraVXqP3MK77TSI0odX0RtA; tracknick=%5Cu8BFA%5Cu8A001314055453562; _cc_=URm48syIZQ%3D%3D; _l_g_=Ug%3D%3D; sg=223; _nk_=%5Cu8BFA%5Cu8A001314055453562; cookie1=BxvBC3XB0E2eMMNFh3Ug1JLf3kITdQy%2B%2BOmAkd14sL4%3D; mt=ci=45_1; thw=cn; x5sec=7b227365617263686170703b32223a22393264393634323330616462356362653037373761396137393463376562646443505779705a5547454e376a3071326a704d75396f514561444449344e7a49794d7a59334e7a49374d54436e68594b652f502f2f2f2f3842227d; uc1=cookie21=WqG3DMC9Fb5mPLIQo9kR&cookie15=U%2BGCWk%2F75gdr5Q%3D%3D&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&pas=0&cookie14=UoexN9HKXq8yng%3D%3D&existShop=false; JSESSIONID=EEB8C1AFC7631766E44A69C8115CE63E; tfstk=c0RlB0T15LW5i3WDCb1S9hxMdKMAZOkPVBR2g48q0ZEwsCdViY24Q_Hxogns4J1..; l=eB_nSsCgL61keOgSBOfwourza77OSIRAguPzaNbMiOCPOQfp57yhW6j6oi89C3GVh6jXR3SVatiMBeYBqIv4n5U62j-la_kmn; isg=BAwM2lIJgvY49paKRzCa4iI_3Wo-RbDvbc26u2bNGLda8az7jlWAfwJDkflJuehH"
        # Request(cookies=...) takes a mapping, so the raw header string is
        # converted into a dict first.
        cookies = self._parse_cookie_header(raw_cookie)
        for url in urls:
            yield scrapy.Request(url=url, cookies=cookies, callback=self.parse)

    @staticmethod
    def _parse_cookie_header(raw_cookie):
        """Convert a ``"k1=v1; k2=v2"`` cookie string into a dict."""
        cookies = {}
        for pair in raw_cookie.split(';'):
            # Pairs are separated by "; ", so strip the surrounding blanks;
            # otherwise every key after the first starts with a space.
            key, sep, value = pair.strip().partition('=')
            if not sep or not key:
                # Skip empty or malformed segments (e.g. a trailing ";").
                continue
            cookies[key] = value
        return cookies

    @staticmethod
    def _absolute_url(value):
        """Return *value* with an ``http:`` scheme, or ``None`` if missing."""
        if not value:
            return None
        url = str(value)
        # Only protocol-relative links ("//host/path") need the scheme
        # prepended; other values are returned unchanged so that an
        # already-absolute URL does not become "http:https://...".
        return 'http:' + url if url.startswith('//') else url

    def parse(self, response):
        """Save the page to disk and yield one ``TaobaoItem`` per product.

        Yields nothing (after logging a warning) when the page does not
        contain a parsable ``g_page_config`` object.
        """
        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
        self.log(f'标题:{response.css("title::text").get()}')

        # The product data is embedded in the page as a JS assignment.
        match = re.search("g_page_config = ({.*?});", response.text)
        if match is None:
            self.logger.warning('g_page_config not found in %s', response.url)
            return
        try:
            goods_data_dict = json.loads(match.group(1))
        except json.JSONDecodeError as exc:
            self.logger.warning('g_page_config is not valid JSON: %s', exc)
            return

        # Walk mods -> itemlist -> data -> auctions, tolerating missing keys.
        node = goods_data_dict
        for key in ("mods", "itemlist", "data", "auctions"):
            if not isinstance(node, dict):
                node = None
                break
            node = node.get(key)
        goods_list = node if isinstance(node, list) else []

        # Copy the fields of every product entry into a TaobaoItem.
        for goods in goods_list:
            item = TaobaoItem()
            item["shop_name"] = goods.get("nick")
            item["title"] = goods.get("raw_title")
            item["address"] = goods.get("item_loc")
            item["price"] = goods.get('view_price')
            item["buy_num"] = goods.get("view_sales")
            item["pic_url"] = self._absolute_url(goods.get("pic_url"))
            item["detail_url"] = self._absolute_url(goods.get("detail_url"))
            print("-+-" * 30)
            self.log(item)
            # Hand the item over to the engine / pipelines.
            yield item
可在 settings.py 中配置默认请求头
# Default headers for requests, configured in settings.py.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    # NOTE(security): captured browser session cookie hard-coded in source.
    # NOTE(review): Scrapy's cookie middleware may discard a "Cookie" header
    # set here while cookies are enabled -- confirm against the Scrapy docs.
    "Cookie": "_m_h5_tk=1cd8de065945ab3f4e341f304d19530a_1655271188481; _m_h5_tk_enc=145e0d75865f74fda590f8a40fd14439; xlly_s=1; _samesite_flag_=true; cookie2=12d5ef85f124f5aaddf7004995927c61; t=473c6653df68764bd121e75ac78fd73d; _tb_token_=e3ee5ee3e446e; enc=xCQgeIGFP2zROdOi0aUHn%2B3XDvyLWqv9YAH03OYeOx6DpFPlJnJaj8qZ%2FFRJE5Ju9tIZQlaYjVB2Nwf8blEKVg%3D%3D; cna=qZMcGwBL1RMCATz3G4XpiNng; sgcookie=E100LTrACKM2RljYFFXiRoUwBXZEq8B6lMyepJEKAOUoA19BuRdrypVbMGbcV8w1ndrf2mSIK3dcsgoH8BsCAxgTfdGsuvoLXZts%2BTjMpkFCWthlxf3FjHguqTMjpYSjx1pE; unb=2872236772; uc3=id2=UUBfQCLoMk06dA%3D%3D&vt3=F8dCvC3%2BnkyPCKu0dGE%3D&nk2=piIK%2FC31Ly%2Fo6YTR6z9qIW0%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D; csg=84c47a65; lgc=%5Cu8BFA%5Cu8A001314055453562; cancelledSubSites=empty; cookie17=UUBfQCLoMk06dA%3D%3D; dnk=%5Cu8BFA%5Cu8A001314055453562; skt=75db6b0a8447c001; existShop=MTY1NTI2NDE5MQ%3D%3D; uc4=nk4=0%40pO1rSxGmF71fehdQkeWsJFCs%2BJnHSb9RazYJ%2Bg%3D%3D&id4=0%40U2LNbzraVXqP3MK77TSI0odX0RtA; tracknick=%5Cu8BFA%5Cu8A001314055453562; _cc_=URm48syIZQ%3D%3D; _l_g_=Ug%3D%3D; sg=223; _nk_=%5Cu8BFA%5Cu8A001314055453562; cookie1=BxvBC3XB0E2eMMNFh3Ug1JLf3kITdQy%2B%2BOmAkd14sL4%3D; JSESSIONID=74834BB2A34122DB2A62AE802EC3C6C7; mt=ci=45_1; uc1=cookie21=U%2BGCWk%2F7p4mBoUyS4E9C&existShop=false&cookie14=UoexN9HKXqh%2BQg%3D%3D&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D; thw=cn; tfstk=cAtOBdiSNXcif_WpUF3h0opFDrHhZkyOiR1YDn162N6m36rAiFxk27pm5t5P6vC..; l=eB_nSsCgL61keTmsBOfwnurza77t_IRAguPzaNbMiOCP_Wfp5YiFW6j68_Y9CnGVh6bwR3SVatiMBeYBqIv4n5U62j-latDmn; isg=BOPj1BdJhWP2lklXVP19t4FacieN2HcabkjFRhVAI8K5VAN2nalIatSGTiTadM8SM",
}