
Commit

Add comments
Lewis Chen authored and chl17 committed Feb 15, 2019
1 parent ffa1f0f commit 1704c02
Showing 11 changed files with 177 additions and 274 deletions.
6 changes: 1 addition & 5 deletions 发改委/NDRC/OCR.py
@@ -1,3 +1,4 @@
# Baidu OCR engine; not used by the NDRC crawler
from aip import AipOcr

config = {
@@ -25,8 +26,3 @@ def img_to_str_net(url):
result = client.basicGeneralUrl(url)
if 'words_result' in result:
return ' '.join([w['words'] for w in result['words_result']])


# print(type(img_to_str_net('http://zfxxgk.nea.gov.cn/auto93/201806/W020180629331250434030.jpg')))
# print(img_to_str('/Users/chenhaolin/PycharmProjects/SRT/IMAGES/full/国家能源局_first 国务院扶贫办关于下达十三五”第一批光伏扶贫项目计划的通知/W020180104619948696131.jpg'))
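
For reference, a minimal sketch of the Baidu OCR flow this module wraps; the credentials and image URL are placeholders, while basicGeneralUrl and the words_result shape come from the code above:

from aip import AipOcr

# Placeholder credentials; the real values live in this module's config dict.
client = AipOcr('APP_ID', 'API_KEY', 'SECRET_KEY')

# OCR an image by URL, as img_to_str_net does above.
result = client.basicGeneralUrl('http://example.com/scan.jpg')  # placeholder URL
if 'words_result' in result:
    text = ' '.join(w['words'] for w in result['words_result'])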

9 changes: 6 additions & 3 deletions 发改委/NDRC/es_operation.py
@@ -1,14 +1,16 @@
# elasticsearch API interface

from datetime import datetime
from elasticsearch_dsl import Date, Nested, Boolean, \
analyzer, Completion, Keyword, Text, Integer, Document

# For more field types, see Section 364, "elasticsearch (search engine) mapping management"

from elasticsearch_dsl.connections import connections # import the helper for connecting to the elasticsearch server
connections.create_connection(hosts=['127.0.0.1'])
connections.create_connection(hosts=['127.0.0.1']) # connect to elasticsearch


def gen_suggests(es_connection, index, info_tuple):
def gen_suggests(es_connection, index, info_tuple): # generate search suggestions
es = es_connection
# build the array of search suggestions from the given strings
used_words = set()
@@ -26,9 +28,10 @@ def gen_suggests(es_connection, index, info_tuple):

if new_words:
suggests.append({"input": list(new_words), "weight": weight})
return suggests # return a dict
return suggests # returned as a list of suggestion dicts


# defines one type of entry to be written to es
class ndrcType(Document): # custom class inheriting from the Document class
# Text fields are tokenized, so a Chinese analyzer must be specified; ik_max_word is the Chinese analyzer
name = "国家发改委" # National Development and Reform Commission (NDRC)
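
For context, a minimal sketch of how the stored suggestions are typically queried with a completion suggester; the index name and query term are assumptions, while the suggest field itself is the one gen_suggests populates via items.py:

from elasticsearch_dsl import Search

# Uses the default connection registered by create_connection above.
s = Search(index='国家发改委')  # index name is an assumption
s = s.suggest('term_suggest', '光伏',
              completion={'field': 'suggest', 'fuzzy': {'fuzziness': 2}, 'size': 10})
response = s.execute()
for option in response.suggest.term_suggest[0].options:
    print(option.text)  # suggested completions, highest weight first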
10 changes: 5 additions & 5 deletions 发改委/NDRC/items.py
@@ -13,15 +13,15 @@
es_connection = connections.create_connection(hosts=['127.0.0.1'])


class NDRCItemLoader(ItemLoader):
class NDRCItemLoader(ItemLoader): # custom item loader; TakeFirst() keeps only the first extracted value for each field
default_output_processor = TakeFirst()


def addval(value): # custom input pre-processing function
return value # return the processed value to the Item
return value # return the processed value to the Item; currently a no-op that returns it unchanged


class NdrcItem(scrapy.Item):
class NdrcItem(scrapy.Item): # NDRC item
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field(input_processor=MapCompose(addval)) # receives the title scraped by the spider; the input_processor transforms the value before it is stored
@@ -41,7 +41,7 @@ class NdrcItem(scrapy.Item):
class3 = scrapy.Field()
website = scrapy.Field()

def save_to_es(self):
def save_to_es(self): # writes this item to es
ndrc_element = ndrcType() # instantiate the elasticsearch document object
ndrc_element.title = self['title'] # field name = value
ndrc_element.content = self['content']
@@ -59,7 +59,7 @@ def save_to_es(self):
ndrc_element.class3 = self['class3']
ndrc_element.website = self['website']

if self['file_content']:
if self['file_content']: # generate search suggestions from title, content, and file_content, each with its own weight
ndrc_element.suggest = gen_suggests(es_connection, ndrcType.name,
((ndrc_element.title, 10), (ndrc_element.file_content, 7),
(ndrc_element.content, 8))) # ndrcType.Index.name
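
A minimal sketch of how a spider callback would typically feed this loader; the import path mirrors runner.py and the XPath expressions are illustrative, not taken from the spiders in this repository:

from 发改委.NDRC.items import NDRCItemLoader, NdrcItem

def parse_detail(self, response):  # inside a spider class
    loader = NDRCItemLoader(item=NdrcItem(), response=response)
    loader.add_xpath('title', '//h1/text()')  # runs through MapCompose(addval)
    loader.add_value('website', response.url)
    item = loader.load_item()  # TakeFirst() keeps one value per field
    yield item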
2 changes: 1 addition & 1 deletion 发改委/NDRC/middlewares.py
@@ -4,7 +4,7 @@
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# spider middleware
from scrapy import signals


254 changes: 127 additions & 127 deletions 发改委/NDRC/pdf2txt.py
@@ -1,127 +1,127 @@
#!/usr/bin/env python
# PDF-to-text utility; usage: "pdf2txt.py <PDF file path>"; the parsed text is printed to the console
"""
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
"""
import sys
import logging
import six
import pdfminer.settings
pdfminer.settings.STRICT = False
import pdfminer.high_level
import pdfminer.layout
from pdfminer.image import ImageWriter


def extract_text(files=[], outfile='-',
_py2_no_more_posargs=None, # Bloody Python2 needs a shim
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
output_type='text', codec='utf-8', strip_control=False,
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
layoutmode='normal', output_dir=None, debug=False,
disable_caching=False, **other):
if _py2_no_more_posargs is not None:
raise ValueError("Too many positional arguments passed.")
if not files:
raise ValueError("Must provide files to work upon!")

# If any LAParams group arguments were passed, create an LAParams object and
# populate with given args. Otherwise, set it to None.
if not no_laparams:
laparams = pdfminer.layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
paramv = locals().get(param, None)
if paramv is not None:
setattr(laparams, param, paramv)
else:
laparams = None

imagewriter = None
if output_dir:
imagewriter = ImageWriter(output_dir)

if output_type == "text" and outfile != "-":
for override, alttype in ( (".htm", "html"),
(".html", "html"),
(".xml", "xml"),
(".tag", "tag") ):
if outfile.endswith(override):
output_type = alttype

if outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
codec = 'utf-8'
else:
outfp = open(outfile, "wb")


for fname in files:
with open(fname, "rb") as fp:
pdfminer.high_level.extract_text_to_fp(fp, **locals())
return outfp

# main
def main(args=None):
import argparse
P = argparse.ArgumentParser(description=__doc__)
P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
A = P.parse_args(args=args)

if A.page_numbers:
A.page_numbers = set([x-1 for x in A.page_numbers])
if A.pagenos:
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])

imagewriter = None
if A.output_dir:
imagewriter = ImageWriter(A.output_dir)

if six.PY2 and sys.stdin.encoding:
A.password = A.password.decode(sys.stdin.encoding)

if A.output_type == "text" and A.outfile != "-":
for override, alttype in ( (".htm", "html"),
(".html", "html"),
(".xml", "xml" ),
(".tag", "tag" ) ):
if A.outfile.endswith(override):
A.output_type = alttype

if A.outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
# Why ignore outfp.encoding? :-/ stupid cathal?
A.codec = 'utf-8'
else:
outfp = open(A.outfile, "wb")

## Test Code
outfp = extract_text(**vars(A))
outfp.close()
return 0


if __name__ == '__main__': sys.exit(main())
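
Besides the command-line usage noted in the new comment, extract_text can be called directly from Python; a minimal sketch with placeholder file names (the import path mirrors runner.py and is an assumption):

from 发改委.NDRC.pdf2txt import extract_text

# Convert one PDF to plain text; both file names are placeholders.
outfp = extract_text(files=['sample.pdf'], outfile='sample.txt')
outfp.close()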
13 changes: 6 additions & 7 deletions 发改委/NDRC/pipelines.py
@@ -27,12 +27,11 @@ def __init__(self, mongo_uri, mongo_db):
@classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_uri=crawler.settings.get('MONGO_URI'), # read configuration values from settings.py via the crawler settings
mongo_db=crawler.settings.get('MONGO_DB')
)

def open_spider(self, spider):

def open_spider(self, spider): # connect to MongoDB
CollectionName = '测试' # + date_time
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
@@ -41,15 +40,15 @@ def process_item(self, item, spider):
def process_item(self, item, spider):
if isinstance(item, NdrcItem):
data = dict(item)
self.Collection.insert(data)
self.Collection.insert(data) # save to MongoDB
item.save_to_es() # save to es
return item

def close_spider(self, spider):
self.client.close()
self.client.close() # close the MongoDB connection


class FilePipeline(FilesPipeline):
class FilePipeline(FilesPipeline): # handles file downloads

def file_path(self, request, response=None, info=None):
item = request.meta['item']
@@ -77,7 +76,7 @@ def get_media_requests(self, item, info):
yield scrapy.Request(file_address, meta={'item': item, 'referer': referer})


class ImagePipeline(ImagesPipeline):
class ImagePipeline(ImagesPipeline): # handles image downloads

def file_path(self, request, response=None, info=None):
item = request.meta['item']
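
A minimal sketch of the settings.py wiring these pipelines imply; the MONGO_URI and MONGO_DB keys come from from_crawler above, while the module path, class names, priorities, and database name are assumptions:

# settings.py (sketch)
ITEM_PIPELINES = {
    'NDRC.pipelines.FilePipeline': 1,
    'NDRC.pipelines.ImagePipeline': 2,
    'NDRC.pipelines.MongoPipeline': 300,  # the MongoDB pipeline shown above; class name assumed
}
MONGO_URI = 'mongodb://127.0.0.1:27017'
MONGO_DB = 'ndrc'  # database name assumed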
1 change: 1 addition & 0 deletions 发改委/NDRC/runner.py
@@ -1,3 +1,4 @@
# Scrapy launcher script
from scrapy.crawler import CrawlerProcess
from 发改委.NDRC.spiders.general import *
from scrapy.utils.project import get_project_settings
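
The imports above suggest the standard CrawlerProcess pattern; a minimal sketch, with the spider class name GeneralSpider as an assumption:

process = CrawlerProcess(get_project_settings())  # load settings.py
process.crawl(GeneralSpider)  # spider class name assumed
process.start()  # blocks until the crawl finishes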