diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 4977e36..2bff26a 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1,2 +1,3 @@ * Chris Hager (https://github.com/metachris) * Taranjeet Singh (https://github.com/staranjeet) +* Jaton Justice (https://github.com/masterjx9) \ No newline at end of file diff --git a/pdfx/__init__.py b/pdfx/__init__.py index cc3e3b2..715e2a4 100644 --- a/pdfx/__init__.py +++ b/pdfx/__init__.py @@ -94,7 +94,7 @@ class PDFx(object): reader = None # ReaderBackend summary = {} - def __init__(self, uri): + def __init__(self, uri, references_data_structure="set"): """ Open PDF handle and parse PDF metadata - `uri` can bei either a filename or an url @@ -125,7 +125,7 @@ def __init__(self, uri): # Create ReaderBackend instance try: - self.reader = PDFMinerBackend(self.stream) + self.reader = PDFMinerBackend(self.stream, references_data_structure=references_data_structure) except PDFSyntaxError as e: raise PDFInvalidError("Invalid PDF (%s)" % unicode(e)) @@ -159,13 +159,13 @@ def get_text(self): def get_metadata(self): return self.reader.get_metadata() - def get_references(self, reftype=None, sort=False): + def get_references(self, reftype=None): """ reftype can be `None` for all, `pdf`, etc. """ - return self.reader.get_references(reftype=reftype, sort=sort) + return self.reader.get_references(reftype=reftype) - def get_references_as_dict(self, reftype=None, sort=False): + def get_references_as_dict(self, reftype=None): """ reftype can be `None` for all, `pdf`, etc. """ - return self.reader.get_references_as_dict(reftype=reftype, sort=sort) + return self.reader.get_references_as_dict(reftype=reftype) def get_references_count(self, reftype=None): """ reftype can be `None` for all, `pdf`, etc. """ diff --git a/pdfx/backends.py b/pdfx/backends.py index 94d9053..890dd5c 100644 --- a/pdfx/backends.py +++ b/pdfx/backends.py @@ -124,12 +124,14 @@ class ReaderBackend(object): text = "" metadata = {} - references = set() - def __init__(self): + def __init__(self, references_data_structure="set"): self.text = "" self.metadata = {} - self.references = set() + if references_data_structure == "list": + self.references = list() + else: # default to set + self.references = set() def get_metadata(self): return self.metadata @@ -163,18 +165,18 @@ def metadata_cleanup(self): def get_text(self): return self.text - def get_references(self, reftype=None, sort=False): + def get_references(self, reftype=None): refs = self.references if reftype: refs = set([ref for ref in refs if ref.reftype == "pdf"]) - return sorted(refs) if sort else refs + return refs - def get_references_as_dict(self, reftype=None, sort=False): + def get_references_as_dict(self, reftype=None): ret = {} refs = self.references if reftype: - refs = set([ref for ref in refs if ref.reftype == "pdf"]) - for r in sorted(refs) if sort else refs: + refs = [ref for ref in refs if ref.reftype == reftype] + for r in refs: if r.reftype in ret: ret[r.reftype].append(r.ref) else: @@ -183,8 +185,8 @@ def get_references_as_dict(self, reftype=None, sort=False): class PDFMinerBackend(ReaderBackend): - def __init__(self, pdf_stream, password="", pagenos=[], maxpages=0): # noqa: C901 - ReaderBackend.__init__(self) + def __init__(self, pdf_stream, password="", pagenos=[], maxpages=0, references_data_structure="set"): # noqa: C901 + ReaderBackend.__init__(self, references_data_structure) self.pdf_stream = pdf_stream # Extract Metadata @@ -238,7 +240,10 @@ def __init__(self, pdf_stream, password="", pagenos=[], maxpages=0): # noqa: C9 if isinstance(refs, list): for ref in refs: if ref: - self.references.add(ref) + if references_data_structure == "list": + self.references.append(ref) + else: + self.references.add(ref) elif isinstance(refs, Reference): self.references.add(refs)