This post records how to use Scrapy to crawl the news articles under the finance channel of www.21jingji.com. Three pieces are listed below: the item definition, the spider, and a MongoDB item pipeline.
# news/items.py -- item definition
import scrapy


class NewsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()          # title
    summary = scrapy.Field()        # abstract
    origin = scrapy.Field()         # URL of the original article
    cover = scrapy.Field()          # cover image of the article
    author = scrapy.Field()         # author
    publish_date = scrapy.Field()   # publish date
    publish_time = scrapy.Field()   # publish date with time
    publish_from = scrapy.Field()   # source of the article
    category = scrapy.Field()       # category the article belongs to, e.g. report
    type = scrapy.Field()           # type: image (img) or text (txt)
    html = scrapy.Field()           # HTML content
    text = scrapy.Field()           # plain-text content
    image_urls = scrapy.Field()     # URLs of images in the article body
    images = scrapy.Field()         # local paths of the downloaded body images
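The last two fields follow the naming convention of Scrapy's built-in ImagesPipeline, which reads image_urls and writes the download results into images. The spider below does not populate image_urls, but if it did, the pictures could be fetched automatically by enabling that pipeline (Pillow must be installed). A minimal settings.py sketch; the storage directory and the priority value are assumptions, not part of the original project:

# settings.py -- hypothetical snippet enabling automatic image downloads
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,  # consumes image_urls, fills images
}
IMAGES_STORE = '/data/news/images'  # assumed local directory for downloaded files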
# -*- coding: utf-8 -*-
# spider module, e.g. news/spiders/finance21.py (filename assumed)
import re
import scrapy
import logging
import datetime

from news.items import NewsItem

# configure the log output format
# note: %T in datefmt is shorthand for %H:%M:%S and may not be available on every platform
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %T'
                    )
logger = logging.getLogger(__name__)
class Finance21Spider(scrapy.Spider):
    name = '21Finance'
    allowed_domains = ['www.21jingji.com']
    start_urls = ['http://www.21jingji.com/channel/finance/']
    current_page = 1
    today = str(datetime.date.today())
    base_url = "http://www.21jingji.com/channel/finance/{}.html"
    has_next = True
    def parse(self, response):  # extract data from the latest-news list page
        logger.info('Current page: ' + str(self.current_page))
        self.current_page = self.current_page + 1
        news_list = response.xpath("//div[@id='data_list']/div[@class='li']")
        for news in news_list:
            content = news.xpath("./div[@class='Tlist']")[0] if len(news.xpath("./div[@class='Tlist']")) > 0 else None
            if content is not None:
                item = NewsItem()
                item['category'] = '资讯'  # "news"
                item['type'] = 'TXT'
                item['origin'] = news.xpath("./a/@href").extract_first()
                item['cover'] = news.xpath("./a/img/@src").extract_first()
                item['title'] = content.xpath("./a[@class='listTit']/@title").extract_first()
                item['publish_from'] = content.xpath("./p[@class='shuoming']/child::span[1]/text()").extract_first()
                item['author'] = content.xpath("./p[@class='shuoming']/child::span[2]/text()").extract_first()
                # fetch the detail page
                yield scrapy.Request(
                    item['origin'],
                    callback=self.parse_detail,
                    meta={'item': item})
        # the list page carries a "查看更多" (load more) link while more pages exist
        moreD = response.xpath("//div[@class='moreD']")[0] if len(response.xpath("//div[@class='moreD']")) > 0 else None
        if moreD is not None:
            moreP = moreD.xpath("p[@class='moreP']/a[text()='查看更多']").extract_first()
            if moreP is None:
                self.has_next = False
        if self.has_next:
            # build the URL of the next list page
            next_url = self.base_url.format(str(self.current_page))
            yield scrapy.Request(next_url, callback=self.parse)
    def parse_detail(self, response):  # extract data from the article detail page
        item = response.meta['item']
        wh = response.xpath("//p[@class='Wh']")[0]
        retstr = wh.xpath("./span[1]/text()").extract_first()
        match = re.search(r"(\d{4}-\d{2}-\d{2})", retstr)
        if match:
            # the first span already holds an ISO-style date, the time follows it
            date_str = match.group(0)
            tm = re.search(r"(\s\d{2}:\d{2})", retstr)
            time_str = tm.group(0).strip() if tm else ''
        else:
            # Chinese date format (年/月/日); the time sits in the second span
            date_str = retstr.replace('年', '-').replace('月', '-').replace('日', '')
            time_str = wh.xpath("./span[2]/text()").extract_first() or ''
        # if date_str == self.today:  # switch to this condition to keep only today's articles
        if True:
            item['publish_date'] = date_str
            item['publish_time'] = date_str + " " + time_str
            item['summary'] = response.xpath("//p[@class='abstract backg']/text()").extract_first()
            news_html = response.xpath("//div[@class='detailCont']")[0]
            texts = news_html.xpath("normalize-space(string(.))").extract()
            for i in range(0, len(texts)):
                texts[i] = re.sub(r'\s', ' ', texts[i])
            item['text'] = ''.join(texts)
            # neutralize in-site links before storing the raw HTML
            item['html'] = news_html.extract().replace('http://www.21jingji.com/', 'javascript:void(0);')
            yield item
            # logger.info(item)
        else:
            # only reachable when the date check above is enabled
            self.has_next = False
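Before wiring up the MongoDB pipeline below, the spider can be smoke-tested on its own. A minimal sketch, assuming Scrapy 2.1 or newer (for the FEEDS setting) and that it is run from the project root; the output file name and the import path of the spider module are assumptions:

# run_spider.py -- hypothetical quick test without the MongoDB pipeline
from scrapy.crawler import CrawlerProcess

from news.spiders.finance21 import Finance21Spider  # adjust to the actual spider module

process = CrawlerProcess(settings={
    'FEEDS': {'finance.jl': {'format': 'jsonlines'}},  # dump items to a JSON Lines file
})
process.crawl(Finance21Spider)
process.start()  # blocks until the crawl finishes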
# -*- coding: utf-8 -*-
# item pipeline, typically news/pipelines.py
from pymongo import MongoClient

from news.items import NewsItem


class NewsPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls()

    def open_spider(self, spider):
        # connect to MongoDB when the spider starts
        self.client = MongoClient(host='172.16.250.238', port=27017)
        # authenticate() works with PyMongo 3.x; PyMongo 4 expects the
        # credentials to be passed to MongoClient() instead
        self.client.test.authenticate('bigdata', 'bigdata')
        self.db = self.client['test']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        collection = self.db['statsNews']
        if isinstance(item, NewsItem):
            collection.insert_one(dict(item))
        # with open('C:\\Users\\pjli\\Desktop\\pyworks\\news.txt', 'w', encoding='utf-8') as f:
        #     json.dump(dict(item), f, ensure_ascii=False, indent=2)
        return item
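For the pipeline to run, it must be registered in the project's settings.py. A minimal sketch, assuming the default project layout (news/pipelines.py); the priority value is arbitrary:

# settings.py -- hypothetical snippet registering the MongoDB pipeline
ITEM_PIPELINES = {
    'news.pipelines.NewsPipeline': 300,  # any value between 0 and 1000 works
}

With that in place, running scrapy crawl 21Finance from the project root starts the spider and writes each NewsItem into the statsNews collection.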