Commit: Add source-tracing Node
Showing 20 changed files with 616 additions and 58 deletions.
@@ -257,4 +257,7 @@ dmypy.json
# docusaurus
.docusaurus/

sftp-config.json

/tmp/*
sftp-config.json
@@ -1,7 +1,5 @@
from bisheng_langchain.chains.combine_documents.stuff import StuffDocumentsChain
from bisheng_langchain.chains.retrieval_qa.base import MultiRetrievalQA

__all__ = [
    'StuffDocumentsChain',
    'MultiRetrievalQA',
]
Empty file.
src/bisheng-langchain/bisheng_langchain/document_loaders/__init__.py: 3 changes (2 additions & 1 deletion)
@@ -1,3 +1,4 @@
 from .elem_pdf import PDFWithSemanticLoader
+from .elem_unstrcutured_loader import ElemUnstructuredLoader, ElemUnstructuredLoaderV0
 
-__all__ = ['PDFWithSemanticLoader']
+__all__ = ['PDFWithSemanticLoader', 'ElemUnstructuredLoader', 'ElemUnstructuredLoaderV0']
src/bisheng-langchain/bisheng_langchain/document_loaders/elem_unstrcutured_loader.py: 149 changes (149 additions & 0 deletions)
@@ -0,0 +1,149 @@
# flake8: noqa
"""Loads PDF with semantic splilter."""
import base64
import io
import json
import logging
import os
import re
import tempfile
import time
from abc import ABC
from collections import Counter
from copy import deepcopy
from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional, Union
from urllib.parse import urlparse

import fitz
import numpy as np
import pypdfium2
import requests
from bisheng_langchain.document_loaders.parsers import LayoutParser
from langchain.docstore.document import Document
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.pdf import BasePDFLoader
from shapely import Polygon
from shapely import box as Rect


def merge_partitions(partitions):
    text_elem_sep = '\n'
    doc_content = []
    is_first_elem = True
    last_label = ''
    prev_length = 0
    metadata = dict(bboxes=[], pages=[], indexes=[], types=[])
    for part in partitions:
        label, text = part['type'], part['text']
        extra_data = part['metadata']['extra_data']
        if is_first_elem:
            f_text = text + '\n' if label == 'Title' else text
            doc_content.append(f_text)
            is_first_elem = False
        else:
            if last_label == 'Title' and label == 'Title':
                doc_content.append('\n' + text + '\n')
            elif label == 'Title':
                doc_content.append('\n\n' + text + '\n')
            elif label == 'Table':
                doc_content.append('\n\n' + text + '\n')
            else:
                doc_content.append(text_elem_sep + text)

        last_label = label
        metadata['bboxes'].extend(
            list(map(lambda x: list(map(int, x)), extra_data['bboxes'])))
        metadata['pages'].extend(extra_data['pages'])
        metadata['types'].extend(extra_data['types'])

        indexes = extra_data['indexes']
        up_indexes = [[s + prev_length, e + prev_length] for (s, e) in indexes]
        metadata['indexes'].extend(up_indexes)
        prev_length += len(doc_content[-1])

    content = ''.join(doc_content)
    return content, metadata


class ElemUnstructuredLoader(BasePDFLoader):
    """Loads a PDF with pypdf and chunks at character level. dummy version
    Loader also stores page numbers in metadata.
    """
    def __init__(self,
                 file_name: str,
                 file_path: str,
                 unstructured_api_key: str = None,
                 unstructured_api_url: str = None,
                 start: int = 0,
                 n: int = None,
                 verbose: bool = False) -> None:
        """Initialize with a file path."""
        self.unstructured_api_url = unstructured_api_url
        self.unstructured_api_key = unstructured_api_key
        self.headers = {'Content-Type': 'application/json'}
        self.file_name = file_name
        self.start = start
        self.n = n
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        b64_data = base64.b64encode(open(self.file_path, 'rb').read()).decode()
        payload = dict(
            filename=os.path.basename(self.file_name),
            b64_data=[b64_data],
            mode='partition',
            parameters={'start': self.start, 'n': self.n})

        resp = requests.post(
            self.unstructured_api_url,
            headers=self.headers,
            json=payload).json()

        partitions = resp['partitions']
        content, metadata = merge_partitions(partitions)
        metadata['source'] = self.file_name

        doc = Document(page_content=content, metadata=metadata)
        return [doc]


class ElemUnstructuredLoaderV0(BasePDFLoader):
    """Loads a PDF with pypdf and chunks at character level. dummy version
    Loader also stores page numbers in metadata.
    """
    def __init__(self,
                 file_name: str,
                 file_path: str,
                 unstructured_api_key: str = None,
                 unstructured_api_url: str = None,
                 start: int = 0,
                 n: int = None,
                 verbose: bool = False) -> None:
        """Initialize with a file path."""
        self.unstructured_api_url = unstructured_api_url
        self.unstructured_api_key = unstructured_api_key
        self.headers = {'Content-Type': 'application/json'}
        self.file_name = file_name
        super().__init__(file_path)

    def load(self) -> List[Document]:
        b64_data = base64.b64encode(open(self.file_path, 'rb').read()).decode()
        payload = dict(
            filename=os.path.basename(self.file_name),
            b64_data=[b64_data],
            mode='text')

        resp = requests.post(
            self.unstructured_api_url,
            headers=self.headers,
            json=payload).json()

        page_content = resp['text']
        meta = {'source': self.file_name}
        doc = Document(page_content=page_content, metadata=meta)
        return [doc]
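
For orientation, a minimal usage sketch of the pieces added above (not part of the commit). The partition schema is read directly off merge_partitions: each element carries a type, its text, and metadata.extra_data with bboxes/pages/indexes/types. All values below are made-up placeholders.

from bisheng_langchain.document_loaders.elem_unstrcutured_loader import merge_partitions

# Two fake partitions in the shape merge_partitions expects.
partitions = [
    {'type': 'Title', 'text': 'Quarterly Report',
     'metadata': {'extra_data': {'bboxes': [[10, 10, 200, 30]], 'pages': [1],
                                 'types': ['Title'], 'indexes': [[0, 16]]}}},
    {'type': 'NarrativeText', 'text': 'Revenue grew in Q3.',
     'metadata': {'extra_data': {'bboxes': [[10, 40, 200, 60]], 'pages': [1],
                                 'types': ['NarrativeText'], 'indexes': [[0, 19]]}}},
]

content, metadata = merge_partitions(partitions)
# The Title keeps a trailing newline, the paragraph is joined with '\n', and the
# second element's indexes are shifted by len('Quarterly Report\n') == 17.
print(content)
print(metadata['indexes'])   # [[0, 16], [17, 36]]

And a sketch of the two loaders run against an unstructured-style partition service; the endpoint URL and file paths are hypothetical, while the constructor arguments and load() behaviour mirror the code above.

from bisheng_langchain.document_loaders import ElemUnstructuredLoader, ElemUnstructuredLoaderV0

api_url = 'http://localhost:8000/partition'   # hypothetical endpoint of the parsing service

# 'partition' mode: layout-aware content plus bboxes/pages/indexes/types metadata.
loader = ElemUnstructuredLoader(
    file_name='report.pdf',         # name sent to the service
    file_path='/data/report.pdf',   # local file that is read and base64-encoded into the request
    unstructured_api_url=api_url,
    start=0,                        # first page to parse
    n=None)                         # None = parse to the end
docs = loader.load()

# 'text' mode: the V0 loader returns plain text with only a 'source' entry in metadata.
loader_v0 = ElemUnstructuredLoaderV0(
    file_name='report.pdf',
    file_path='/data/report.pdf',
    unstructured_api_url=api_url)
print(loader_v0.load()[0].page_content[:200])
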
src/bisheng-langchain/bisheng_langchain/retrievers/__init__.py: 5 changes (5 additions & 0 deletions)
@@ -0,0 +1,5 @@
from bisheng_langchain.retrievers.mix_es_vector import MixEsVectorRetriever

__all__ = [
    "MixEsVectorRetriever"
]
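
The MixEsVectorRetriever implementation itself is not shown in this excerpt; what the new package adds is the public import path, so downstream code can do, for example:

from bisheng_langchain.retrievers import MixEsVectorRetriever  # ES keyword retrieval mixed with vector retrieval, per the module and class name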