
Commit

Add comments
Lewis Chen authored and chl17 committed Feb 15, 2019
1 parent ffa1f0f commit 1704c02
Showing 11 changed files with 177 additions and 274 deletions.
6 changes: 1 addition & 5 deletions 发改委/NDRC/OCR.py
@@ -1,3 +1,4 @@
# Baidu OCR engine; not used by the NDRC crawler
from aip import AipOcr

config = {
@@ -25,8 +26,3 @@ def img_to_str_net(url):
result = client.basicGeneralUrl(url)
if 'words_result' in result:
return ' '.join([w['words'] for w in result['words_result']])


# print(type(img_to_str_net('http://zfxxgk.nea.gov.cn/auto93/201806/W020180629331250434030.jpg')))
# print(img_to_str('/Users/chenhaolin/PycharmProjects/SRT/IMAGES/full/国家能源局_first 国务院扶贫办关于下达十三五”第一批光伏扶贫项目计划的通知/W020180104619948696131.jpg'))
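
For reference, a minimal sketch of the Baidu OCR flow this module wraps; the credentials and image URL are placeholders, while basicGeneralUrl and the words_result shape come from the code above:

from aip import AipOcr

# Placeholder credentials; the real values live in this module's config dict.
client = AipOcr('APP_ID', 'API_KEY', 'SECRET_KEY')

# OCR an image by URL, as img_to_str_net does above.
result = client.basicGeneralUrl('http://example.com/scan.jpg')  # placeholder URL
if 'words_result' in result:
    text = ' '.join(w['words'] for w in result['words_result'])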

9 changes: 6 additions & 3 deletions 发改委/NDRC/es_operation.py
@@ -1,14 +1,16 @@
# elasticsearch API interface

from datetime import datetime
from elasticsearch_dsl import Date, Nested, Boolean, \
analyzer, Completion, Keyword, Text, Integer, Document

# For more field types, see Section 364, "elasticsearch (search engine) mapping management"

from elasticsearch_dsl.connections import connections # import the helper for connecting to the elasticsearch server
connections.create_connection(hosts=['127.0.0.1'])
connections.create_connection(hosts=['127.0.0.1']) # connect to elasticsearch


def gen_suggests(es_connection, index, info_tuple):
def gen_suggests(es_connection, index, info_tuple): # generate search suggestions
es = es_connection
# build the array of search suggestions from the given strings
used_words = set()
@@ -26,9 +28,10 @@ def gen_suggests(es_connection, index, info_tuple):

if new_words:
suggests.append({"input": list(new_words), "weight": weight})
return suggests # return a dict
return suggests # returned as a list of suggestion dicts


# defines one type of entry to be written to es
class ndrcType(Document): # custom class inheriting from the Document class
# Text fields are tokenized, so a Chinese analyzer must be specified; ik_max_word is the Chinese analyzer
name = "国家发改委" # National Development and Reform Commission (NDRC)
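
For context, a minimal sketch of how the stored suggestions are typically queried with a completion suggester; the index name and query term are assumptions, while the suggest field itself is the one gen_suggests populates via items.py:

from elasticsearch_dsl import Search

# Uses the default connection registered by create_connection above.
s = Search(index='国家发改委')  # index name is an assumption
s = s.suggest('term_suggest', '光伏',
              completion={'field': 'suggest', 'fuzzy': {'fuzziness': 2}, 'size': 10})
response = s.execute()
for option in response.suggest.term_suggest[0].options:
    print(option.text)  # suggested completions, highest weight first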
10 changes: 5 additions & 5 deletions 发改委/NDRC/items.py
@@ -13,15 +13,15 @@
es_connection = connections.create_connection(hosts=['127.0.0.1'])


class NDRCItemLoader(ItemLoader):
class NDRCItemLoader(ItemLoader): # custom item loader; TakeFirst() keeps only the first extracted value for each field
default_output_processor = TakeFirst()


def addval(value): # custom input pre-processing function
return value # return the processed value to the Item
return value # return the processed value to the Item; currently a no-op that returns it unchanged


class NdrcItem(scrapy.Item):
class NdrcItem(scrapy.Item): # NDRC item
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field(input_processor=MapCompose(addval)) # receives the title scraped by the spider; the input_processor transforms the value before it is stored
@@ -41,7 +41,7 @@ class NdrcItem(scrapy.Item):
class3 = scrapy.Field()
website = scrapy.Field()

def save_to_es(self):
def save_to_es(self): # writes this item to es
ndrc_element = ndrcType() # instantiate the elasticsearch document object
ndrc_element.title = self['title'] # field name = value
ndrc_element.content = self['content']
@@ -59,7 +59,7 @@ def save_to_es(self):
ndrc_element.class3 = self['class3']
ndrc_element.website = self['website']

if self['file_content']:
if self['file_content']: # generate search suggestions from title, content, and file_content, each with its own weight
ndrc_element.suggest = gen_suggests(es_connection, ndrcType.name,
((ndrc_element.title, 10), (ndrc_element.file_content, 7),
(ndrc_element.content, 8))) # ndrcType.Index.name
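
A minimal sketch of how a spider callback would typically feed this loader; the import path mirrors runner.py and the XPath expressions are illustrative, not taken from the spiders in this repository:

from 发改委.NDRC.items import NDRCItemLoader, NdrcItem

def parse_detail(self, response):  # inside a spider class
    loader = NDRCItemLoader(item=NdrcItem(), response=response)
    loader.add_xpath('title', '//h1/text()')  # runs through MapCompose(addval)
    loader.add_value('website', response.url)
    item = loader.load_item()  # TakeFirst() keeps one value per field
    yield item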
2 changes: 1 addition & 1 deletion 发改委/NDRC/middlewares.py
@@ -4,7 +4,7 @@
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# spider middleware
from scrapy import signals


254 changes: 127 additions & 127 deletions 发改委/NDRC/pdf2txt.py
@@ -1,127 +1,127 @@
#!/usr/bin/env python
# PDF-to-text utility; usage: "pdf2txt.py <PDF file path>"; the parsed text is printed to the console
"""
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
"""
import sys
import logging
import six
import pdfminer.settings
pdfminer.settings.STRICT = False
import pdfminer.high_level
import pdfminer.layout
from pdfminer.image import ImageWriter


def extract_text(files=[], outfile='-',
_py2_no_more_posargs=None, # Bloody Python2 needs a shim
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
output_type='text', codec='utf-8', strip_control=False,
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
layoutmode='normal', output_dir=None, debug=False,
disable_caching=False, **other):
if _py2_no_more_posargs is not None:
raise ValueError("Too many positional arguments passed.")
if not files:
raise ValueError("Must provide files to work upon!")

# If any LAParams group arguments were passed, create an LAParams object and
# populate with given args. Otherwise, set it to None.
if not no_laparams:
laparams = pdfminer.layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
paramv = locals().get(param, None)
if paramv is not None:
setattr(laparams, param, paramv)
else:
laparams = None

imagewriter = None
if output_dir:
imagewriter = ImageWriter(output_dir)

if output_type == "text" and outfile != "-":
for override, alttype in ( (".htm", "html"),
(".html", "html"),
(".xml", "xml"),
(".tag", "tag") ):
if outfile.endswith(override):
output_type = alttype

if outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
codec = 'utf-8'
else:
outfp = open(outfile, "wb")


for fname in files:
with open(fname, "rb") as fp:
pdfminer.high_level.extract_text_to_fp(fp, **locals())
return outfp

# main
def main(args=None):
import argparse
P = argparse.ArgumentParser(description=__doc__)
P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
A = P.parse_args(args=args)

if A.page_numbers:
A.page_numbers = set([x-1 for x in A.page_numbers])
if A.pagenos:
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])

imagewriter = None
if A.output_dir:
imagewriter = ImageWriter(A.output_dir)

if six.PY2 and sys.stdin.encoding:
A.password = A.password.decode(sys.stdin.encoding)

if A.output_type == "text" and A.outfile != "-":
for override, alttype in ( (".htm", "html"),
(".html", "html"),
(".xml", "xml" ),
(".tag", "tag" ) ):
if A.outfile.endswith(override):
A.output_type = alttype

if A.outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
# Why ignore outfp.encoding? :-/ stupid cathal?
A.codec = 'utf-8'
else:
outfp = open(A.outfile, "wb")

## Test Code
outfp = extract_text(**vars(A))
outfp.close()
return 0


if __name__ == '__main__': sys.exit(main())
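
Besides the command-line usage noted in the new comment, extract_text can be called directly from Python; a minimal sketch with placeholder file names (the import path mirrors runner.py and is an assumption):

from 发改委.NDRC.pdf2txt import extract_text

# Convert one PDF to plain text; both file names are placeholders.
outfp = extract_text(files=['sample.pdf'], outfile='sample.txt')
outfp.close()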
13 changes: 6 additions & 7 deletions 发改委/NDRC/pipelines.py
@@ -27,12 +27,11 @@ def __init__(self, mongo_uri, mongo_db):
@classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_uri=crawler.settings.get('MONGO_URI'), # read configuration values from settings.py via the crawler settings
mongo_db=crawler.settings.get('MONGO_DB')
)

def open_spider(self, spider):

def open_spider(self, spider): # connect to MongoDB
CollectionName = '测试' # + date_time
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
@@ -41,15 +40,15 @@ def process_item(self, item, spider):
def process_item(self, item, spider):
if isinstance(item, NdrcItem):
data = dict(item)
self.Collection.insert(data)
self.Collection.insert(data) # save to MongoDB
item.save_to_es() # save to es
return item

def close_spider(self, spider):
self.client.close()
self.client.close() # close the MongoDB connection


class FilePipeline(FilesPipeline):
class FilePipeline(FilesPipeline): # handles file downloads

def file_path(self, request, response=None, info=None):
item = request.meta['item']
@@ -77,7 +76,7 @@ def get_media_requests(self, item, info):
yield scrapy.Request(file_address, meta={'item': item, 'referer': referer})


class ImagePipeline(ImagesPipeline):
class ImagePipeline(ImagesPipeline): # handles image downloads

def file_path(self, request, response=None, info=None):
item = request.meta['item']
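
A minimal sketch of the settings.py wiring these pipelines imply; the MONGO_URI and MONGO_DB keys come from from_crawler above, while the module path, class names, priorities, and database name are assumptions:

# settings.py (sketch)
ITEM_PIPELINES = {
    'NDRC.pipelines.FilePipeline': 1,
    'NDRC.pipelines.ImagePipeline': 2,
    'NDRC.pipelines.MongoPipeline': 300,  # the MongoDB pipeline shown above; class name assumed
}
MONGO_URI = 'mongodb://127.0.0.1:27017'
MONGO_DB = 'ndrc'  # database name assumed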
1 change: 1 addition & 0 deletions 发改委/NDRC/runner.py
@@ -1,3 +1,4 @@
# Scrapy launcher script
from scrapy.crawler import CrawlerProcess
from 发改委.NDRC.spiders.general import *
from scrapy.utils.project import get_project_settings
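
The imports above suggest the standard CrawlerProcess pattern; a minimal sketch, with the spider class name GeneralSpider as an assumption:

process = CrawlerProcess(get_project_settings())  # load settings.py
process.crawl(GeneralSpider)  # spider class name assumed
process.start()  # blocks until the crawl finishes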