Skip to content

Commit

Permalink
Fusionner les images qui se chevauchent (aussi le texte) (#5)
Browse files Browse the repository at this point in the history
* refactor: initial refactoring of analyser

* refactor: remove now unused argument

* feat: extract with new image merging (looks better!)

* fix: fix tests

* fix: images in html + isort (oops)

* ci: test with CI

* fix: add some types

* ci: oh ffs yaml

* ci: fix name

* fix: some things

* fix(ci): install types

* fix(ci): install

* fix: types types types

* fix(ci): wtf mypy

* fix(ci): wtf mypy

* fix(ci): argh ugh asdfasdfszdfa
  • Loading branch information
dhdaines authored Nov 20, 2023
1 parent 4f63b9d commit 18dc2d2
Show file tree
Hide file tree
Showing 26 changed files with 654 additions and 601 deletions.
71 changes: 71 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
name: Tests

on: [pull_request]

jobs:
lint:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3

- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"

- name: Configure pip caching
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt')}}-${{ hashFiles('**/requirements-dev.txt') }}

- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt
pip install -e .
- name: Validate against psf/black
run: python -m black --check alexi test scripts

- name: Validate against isort
run: python -m isort --profile black --check-only alexi test scripts

- name: Validate against flake8
run: python -m flake8 alexi test scripts

- name: Check type annotations via mypy
run: python -m mypy alexi test scripts

test:
needs: lint
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3

- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"

- name: Configure pip caching
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt')}}-${{ hashFiles('**/requirements-dev.txt') }}

- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt
pip install -e .
- name: Run tests
run: |
python -m pytest --cov alexi
python -m coverage html
- name: Upload code coverage
uses: codecov/codecov-action@v3
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@ __pycache__
*.egg-info/
.ipynb_checkpoints
indexdir/
ville.sainte-adele.qc.ca/
download/
export/
notebooks/
.coverage
84 changes: 49 additions & 35 deletions alexi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,25 @@

import argparse
import csv
import dataclasses
import itertools
import json
import logging
import re
import subprocess
import operator
import sys
from pathlib import Path
from typing import Any, Iterable, TextIO

from bs4 import BeautifulSoup

from .analyse import Analyseur
from .convert import FIELDNAMES, Converteur
from . import download, extract
from .analyse import Analyseur, Bloc
from .convert import FIELDNAMES, Converteur, merge_overlaps
from .format import format_dict, format_html, format_xml
from .index import index
from .label import DEFAULT_MODEL as DEFAULT_LABEL_MODEL
from .label import Extracteur
from .search import search
from .segment import DEFAULT_MODEL as DEFAULT_SEGMENT_MODEL
from .segment import Segmenteur
from . import extract, download

LOGGER = logging.getLogger("alexi")

Expand All @@ -38,7 +37,7 @@ def write_csv(
writer.writerows(doc)


def convert_main(args):
def convert_main(args: argparse.Namespace):
"""Convertir les PDF en CSV"""
if args.pages:
pages = [max(0, int(x) - 1) for x in args.pages.split(",")]
Expand All @@ -47,68 +46,80 @@ def convert_main(args):
conv = Converteur(args.pdf)
if args.images is not None:
args.images.mkdir(parents=True, exist_ok=True)
images = {}
for bloc in conv.extract_images(pages):
images.setdefault(bloc.page_number, []).append(bloc.img)
img = (
conv.pdf.pages[bloc.page_number - 1]
.crop(bloc.bbox)
.to_image(resolution=150, antialias=True)
)
LOGGER.info("Extraction de %s", args.images / bloc.img)
img.save(args.images / bloc.img)
images: list[dict] = []
for page_number, group in itertools.groupby(
conv.extract_images(pages), operator.attrgetter("page_number")
):
merged = merge_overlaps(group)
for bloc in merged:
images.append(dataclasses.asdict(bloc))
img = (
conv.pdf.pages[bloc.page_number - 1]
.crop(bloc.bbox)
.to_image(resolution=150, antialias=True)
)
LOGGER.info("Extraction de %s", args.images / bloc.img)
img.save(args.images / bloc.img)
with open(args.images / "images.json", "wt") as outfh:
json.dump(images, outfh)
json.dump(images, outfh, indent=2)
write_csv(conv.extract_words(pages), sys.stdout)


def segment_main(args):
def segment_main(args: argparse.Namespace):
"""Segmenter un CSV"""
crf = Segmenteur(args.model)
reader = csv.DictReader(args.csv)
write_csv(crf(reader), sys.stdout)


def label_main(args):
def label_main(args: argparse.Namespace):
"""Étiquetter un CSV"""
crf = Extracteur(args.model)
reader = csv.DictReader(args.csv)
write_csv(crf(reader), sys.stdout)


def xml_main(args):
def xml_main(args: argparse.Namespace):
"""Convertir un CSV segmenté et étiquetté en XML"""
reader = csv.DictReader(args.csv)
doc = Analyseur()(reader)
print(format_xml(doc))
analyseur = Analyseur(args.csv.name, reader)
print(format_xml(analyseur()))


def html_main(args):
def html_main(args: argparse.Namespace):
"""Convertir un CSV segmenté et étiquetté en HTML"""
reader = csv.DictReader(args.csv)
doc = Analyseur()(reader)
print(format_html(doc))
analyseur = Analyseur(args.csv.name, reader)
if args.images is not None:
with open(args.images / "images.json", "rt") as infh:
images = (Bloc(**image_dict) for image_dict in json.load(infh))
analyseur.add_images(images, merge=False)
doc = analyseur()
print(format_html(doc, imgdir=args.images))
else:
doc = analyseur()
print(format_html(doc))


def json_main(args):
def json_main(args: argparse.Namespace):
"""Convertir un CSV segmenté et étiquetté en JSON"""
iob = csv.DictReader(args.csv)
analyseur = Analyseur()
analyseur = Analyseur(args.csv.name, iob)
if args.images:
with open(args.images, "rt") as infh:
images = json.load(infh)
doc = analyseur(iob, images)
with open(args.images / "images.json", "rt") as infh:
images = [Bloc(**image_dict) for image_dict in json.load(infh)]
doc = analyseur(images)
else:
doc = analyseur(iob)
doc = analyseur()
print(json.dumps(format_dict(doc), indent=2, ensure_ascii=False))


def index_main(args):
def index_main(args: argparse.Namespace):
"""Construire un index sur des fichiers JSON"""
index(args.indir, args.outdir)


def search_main(args):
def search_main(args: argparse.Namespace):
"""Lancer une recherche sur l'index"""
search(args.indexdir, args.query)

Expand Down Expand Up @@ -178,6 +189,9 @@ def make_argparse() -> argparse.ArgumentParser:
help="Extraire la structure en format HTML en partant du CSV étiquetté",
)
html.add_argument("csv", help="Fichier CSV à traiter", type=argparse.FileType("rt"))
html.add_argument(
"--images", help="Répertoire avec des images des tableaux", type=Path
)
html.set_defaults(func=html_main)

jsonf = subp.add_parser(
Expand Down
62 changes: 47 additions & 15 deletions alexi/analyse.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dataclasses import dataclass, field
from typing import Iterable, Iterator, Optional

from .convert import merge_overlaps
from .types import Bloc, T_obj

LOGGER = logging.getLogger("analyse")
Expand Down Expand Up @@ -173,23 +174,54 @@ def numero(self) -> str:
class Analyseur:
"""Analyse d'un document étiqueté en IOB."""

def __init__(self, fileid: str, words: Iterable[T_obj]):
self.fileid = fileid
self.words: list[T_obj] = list(words)
self.blocs: list[Bloc] = list(group_iob(self.words, "segment"))
self.metadata: dict[str, str] = {}
for bloc in group_iob(self.words, "sequence"):
if bloc.type not in self.metadata:
LOGGER.info(f"{bloc.type}: {bloc.texte}")
self.metadata[bloc.type] = bloc.texte

def add_images(self, images: Iterable[Bloc], merge: bool = True):
"""Insérer les images en les fusionnant avec le texte (et entre elles)
si demandé."""
images_bypage: dict[int, list[Bloc]] = {
page_number: list(group)
for page_number, group in itertools.groupby(
images, operator.attrgetter("page_number")
)
}

# FIXME: assume that we can order things this way!
def bbox_order(bloc):
x0, top, x1, bottom = bloc.bbox
return (top, x0, bottom, x1)

new_blocs: list[Bloc] = []
for page_number, group in itertools.groupby(
self.blocs, operator.attrgetter("page_number")
):
if page_number in images_bypage:
page_blocs = list(group)
page_blocs.extend(images_bypage[page_number])
page_blocs.sort(key=bbox_order)
if merge:
new_blocs.extend(merge_overlaps(page_blocs))
else:
new_blocs.extend(page_blocs)
else:
new_blocs.extend(group)
self.blocs = new_blocs

def __call__(
self,
fileid: str,
words: Iterable[T_obj],
blocs: Optional[Iterable[Bloc]] = None,
) -> Document:
"""Analyse du structure d'un document."""
# Store all inputs as we will do two passes (for sequence and segment tags)
word_sequence = list(words)
# Get metadata from sequence tags
metadata = {}
for bloc in group_iob(word_sequence, "sequence"):
if bloc.type not in metadata:
LOGGER.info(f"{bloc.type}: {bloc.texte}")
metadata[bloc.type] = bloc.texte
titre = metadata.get("Titre", "Document")
numero = metadata.get("Numero", "")
titre = self.metadata.get("Titre", "Document")
numero = self.metadata.get("Numero", "")
if m := re.search(r"(?i:num[ée]ro)\s+([0-9][A-Z0-9-]+)", titre):
LOGGER.info("Numéro extrait du titre: %s", m.group(1))
numero = m.group(1)
Expand All @@ -198,11 +230,11 @@ def __call__(
LOGGER.info("Numéro extrait du titre: %s", m.group(1))
numero = m.group(1)
titre = titre[: m.start(1)] + titre[m.end(1) :]
doc = Document(fileid, numero, titre)
doc.meta = metadata
doc = Document(self.fileid, numero, titre)
doc.meta = self.metadata
# Group block-level text elements by page from segment tags
if blocs is None:
blocs = group_iob(word_sequence)
blocs = self.blocs
for page, blocs in itertools.groupby(blocs, operator.attrgetter("page_number")):
for bloc in blocs:
doc.add_bloc(bloc)
Expand Down
Loading

0 comments on commit 18dc2d2

Please sign in to comment.