-
Notifications
You must be signed in to change notification settings - Fork 0
/
postprocess.py
58 lines (50 loc) · 1.88 KB
/
postprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import re
import cv2
import neuspell
import numpy as np
from neuspell import SclstmChecker
from config import LOG
class PostProcessor:
    """Clean up raw text produced by the OCR step.

    The pipeline entry point is :meth:`postprocess`, which drops empty lines
    and then normalises characters. Spell checking is available through
    :meth:`spellingCheck` but is not part of the default pipeline.
    """

    def __init__(self, CFG):
        """Store the settings needed for cleaning.

        Args:
            CFG: Configuration object. Only ``CFG.chars`` is read; it is used
                as the regular-expression pattern passed to ``re.sub`` in
                :meth:`cleanText` (every match is replaced by a space).
        """
        self.CHARS = CFG.chars
        # Spell checker is created lazily on first use (see spellingCheck).
        self._checker = None

    # Remove empty lines from the text recognized by Tesseract
    def removeEmptyLines(self, txt: str) -> str:
        """Return ``txt`` without blank or whitespace-only lines.

        The text is split on ``'\\n'``. Lines that are empty after
        ``str.strip()`` are dropped; every kept line has all ``'\\r'``
        characters removed and is terminated with a single ``'\\n'``.
        Returns ``""`` when no line survives.
        """
        LOG.logger.info("<<<< Removing Empty Lines >>>>")
        # After split('\n') a line can no longer contain '\n', so only the
        # carriage returns need to be removed from the kept lines.
        return "".join(
            line.replace('\r', '') + "\n"
            for line in txt.split('\n')
            if line.strip()
        )

    # Remove undefined characters, replace @ by a, and
    # replace stray characters inside dot runs with .
    def cleanText(self, txt: str) -> str:
        """Normalise characters in ``txt``.

        Steps, in order:

        1. Every match of the ``self.CHARS`` pattern is replaced by a space.
        2. The result is scanned left to right, emitting exactly one output
           character per input character:

           * ``'@'`` becomes ``'a'``;
           * otherwise, if the four most recently emitted characters are all
             ``'.'`` and the current character is neither ``'.'`` nor a
             space, it is emitted as ``'.'``;
           * otherwise the character is emitted unchanged.

        Because a replaced character is itself emitted as ``'.'``, the
        replacement keeps going until a space or an ``'@'`` ends the run of
        dots (``"....ab c"`` becomes ``"...... c"``).
        """
        LOG.logger.info("<<<< Cleaning Text >>>>")
        txt = re.sub(self.CHARS, ' ', txt)

        cleaned = []
        dot_run = 0  # number of consecutive '.' at the end of `cleaned`
        for ch in txt:
            if ch == '@':
                ch = 'a'
            elif dot_run >= 4 and ch not in '. ':
                ch = '.'
            cleaned.append(ch)
            dot_run = dot_run + 1 if ch == '.' else 0
        return ''.join(cleaned)

    # Use Bert checker to fix spelling errors, see https://github.com/neuspell/neuspell
    def spellingCheck(self, txt):
        """Return ``txt`` after correction by a neuspell ``SclstmChecker``.

        The checker is configured with ``add_("bert", at="output")`` and its
        pretrained weights are loaded on the first call only; the loaded
        checker is kept on the instance and reused by later calls.
        """
        LOG.logger.info("<<<< Check Spelling Using Bert >>>>")
        # getattr keeps this working for instances built without __init__.
        checker = getattr(self, "_checker", None)
        if checker is None:
            checker = SclstmChecker()
            checker = checker.add_("bert", at="output")  # "elmo" or "bert", "input" or "output"
            checker.from_pretrained()
            self._checker = checker
        return checker.correct(txt)

    def postprocess(self, txt: str) -> str:
        """Run the default pipeline: drop empty lines, then clean the text.

        Spell checking is not applied here; call :meth:`spellingCheck`
        separately when it is wanted.
        """
        LOG.logger.info("<<<< Start Postprocessing >>>>")
        txt = self.removeEmptyLines(txt)
        txt = self.cleanText(txt)
        return txt