博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
投诉网站爬虫
阅读量:6472 次
发布时间:2019-06-23

本文共 2777 字,大约阅读时间需要 9 分钟。

# -*- coding: utf-8 -*-
import scrapy

from yg.items import YgItem


class YgSpiderSpider(scrapy.Spider):
    """Crawl the Sunshine hotline complaint board (wz.sun0769.com).

    Walks the paginated complaint list; for each row, follows the
    complaint link and scrapes the detail page (body text and images).
    """

    name = 'yg_spider'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        """Build one YgItem per list row and request its detail page."""
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YgItem()
            item["title"] = tr.xpath("./td[2]/a[2]/@title").extract_first()
            item["href"] = tr.xpath("./td[2]/a[2]/@href").extract_first()
            item["update_time"] = tr.xpath("./td[last()]/text()").extract_first()

            # BUG FIX: rows without the expected anchor produced
            # scrapy.Request(None), which raises ValueError and kills
            # the whole callback. Skip such rows instead.
            if item["href"] is None:
                continue

            yield scrapy.Request(
                # urljoin handles relative hrefs; absolute URLs pass through.
                response.urljoin(item["href"]),
                callback=self.parse_detail,
                meta={"item": item},
            )

        # Pagination: the ">" anchor links to the next list page.
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Fill in body text and image URLs from a complaint detail page."""
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
        item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        # Image srcs are site-relative; prefix the host to make them absolute.
        item["content_img"] = ["http://wz.sun0769.com" + i for i in item["content_img"]]
        yield item
# -*- coding: utf-8 -*-

# Item pipelines for the yg project.
#
# Remember to enable this pipeline in the ITEM_PIPELINES setting.
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
import json


class YgPipeline(object):
    """Clean each item's content and append it to ``yg.txt`` as JSON."""

    def process_item(self, item, spider):
        """Normalize the content field, persist the item, and pass it on."""
        item["content"] = self.process_content(item["content"])
        serialized = json.dumps(dict(item), ensure_ascii=False, indent=4)
        with open("yg.txt", "a", encoding="utf-8") as out:
            out.write(serialized)
            out.write("\n")
        return item

    def process_content(self, content):
        """Strip whitespace and &nbsp; noise; drop fragments left empty."""
        stripped = (re.sub(r'\xa0|\s', "", piece) for piece in content)
        return [piece for piece in stripped if piece]
# -*- coding: utf-8 -*-

# Models for the scraped complaint records.
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YgItem(scrapy.Item):
    """A single complaint record scraped from wz.sun0769.com."""

    title = scrapy.Field()        # complaint headline from the list page
    href = scrapy.Field()         # URL of the complaint's detail page
    update_time = scrapy.Field()  # timestamp column from the list page
    content = scrapy.Field()      # body text fragments from the detail page
    content_img = scrapy.Field()  # image URLs from the detail page

 

转载于:https://www.cnblogs.com/sure-feng/p/10092283.html

你可能感兴趣的文章
Cross Site Request Forgery (CSRF)--spring security -转
查看>>
oracle 性能优化--索引总结
查看>>
获取路径的方法
查看>>
VS2008常见编译错误(总结篇)
查看>>
KeepAlive--高可用解决方案
查看>>
Zsh 开发指南(第八篇 变量修饰语)
查看>>
MySQL并发控制
查看>>
Ionic2入门教程(二)进阶配置:Android打包
查看>>
springboot_demo项目介绍
查看>>
为微信小程序增加mixin扩展
查看>>
JavaScript专题之jQuery通用遍历方法each的实现
查看>>
svg简单的小案例
查看>>
「翻译」新增自订义工具列及按钮
查看>>
spring security运行时配置ignore url
查看>>
用Python实现一个优先级队列(Priority Queue)
查看>>
给自己的Fonts教程续
查看>>
剖析 Laravel 计划任务--创建和运行系统命令
查看>>
让拆库拆表见鬼去吧! MySQL 扩展新玩法
查看>>
Javascript面向对象编程 -- 设计模式
查看>>
用Python多线程实现生产者消费者模式
查看>>