Skip to content

Commit

Permalink
feat/error_check: add path argument checks and simplify path processing. (
Browse files Browse the repository at this point in the history
…#15)

* feat: add exit 1 for exception in cli.

* feat: propagate exception.

* test: add shell test for cli.

* feat: set default folder to `.` (the current directory).

* refactor: better log message.

* refactor: simplify path processing and check that the input file exists.

* chore: add gitpod extension.

* chore: add gitpod extension.
  • Loading branch information
FFengIll authored Mar 11, 2023
1 parent c3596b8 commit e8829eb
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 32 deletions.
4 changes: 2 additions & 2 deletions .gitpod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ tasks:
vscode:
extensions:
- ms-python.python

- cweijan.vscode-office

- ms-python.black-formatter
- eamodio.gitlens
15 changes: 9 additions & 6 deletions cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
logger.remove()
logger.add(sys.stderr, level="DEBUG")

if args.input and args.output:
edit_pdf(args.input, args.output, args.ignore)
elif args.indir and args.outdir:
batch_edit_pdf(args.indir, args.outdir, args.ignore)
else:
get_parser().print_help()
# Dispatch to single-file or batch mode based on the parsed arguments, and
# turn any processing failure into a non-zero exit status for shell callers.
try:
    if args.input and args.output:
        # single-file mode: one input pdf -> one output pdf
        edit_pdf(args.input, args.output, args.ignore)
    elif args.indir and args.outdir:
        # batch mode: operate on an input directory and an output directory
        batch_edit_pdf(args.indir, args.outdir, args.ignore)
    else:
        get_parser().print_help()
except Exception as err:
    # Catch `Exception` only (not a bare `except:`), so KeyboardInterrupt and
    # SystemExit keep their normal meaning instead of being mapped to status 1.
    # Report the reason before exiting; otherwise the failure is silent.
    logger.error("failed: {}", err)
    # `sys.exit` is always available, unlike the `exit` helper injected by `site`.
    sys.exit(1)
5 changes: 2 additions & 3 deletions pdf_white_cut/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def extract_item_box(item):
elif isinstance(item, LTCurve):
logger.debug("use itself: {}", item)
elif isinstance(item, LTTextBox):
logger.warning("NotImplemented and use itself: {}", item)
logger.warning("use itself since NotImplemented: {}", item)
elif isinstance(item, LTTextLine):
# there is 2 types of `LTTextLine`: horizontal and vertical
text = item.get_text().encode("unicode_escape")
Expand Down Expand Up @@ -91,7 +91,7 @@ def extract_item_box(item):
bbox[3] + item.height / 2,
)
elif isinstance(item, LTImage):
logger.warning("NotImplemented and use itself: {}", item)
logger.warning("use itself since NotImplemented: {}", item)
elif isinstance(item, LTFigure):
logger.debug("analyse LTFigure:{}", item)
# for `LTFigure`, the bbox is modified in `PDFMiner`
Expand Down Expand Up @@ -160,7 +160,6 @@ def extract_pdf_boxs(filename, ignore=0):
boxs.append(box)

max_box = get_max_box(boxs)
logger.warning("visible bbox: {}", max_box)
page_boxs.append(max_box)

logger.warning("max visible bbox for the page: {}", max_box)
Expand Down
40 changes: 21 additions & 19 deletions pdf_white_cut/cutter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,21 @@ def edit_page_box(page, visible_box):
logger.info("cut media box to: {}", box)


def edit_pdf(source: str, target: str, ignore=0):
def edit_pdf(source: Path, target: Path, ignore=0):
"""
edit to cut the white slide of the input pdf file, and output a new pdf file.
"""
# guard type
source = Path(source)
target = Path(target)

if source == target:
logger.error("{} {}", source, target)
raise Exception("input and output can not be the same!")

if not source.exists():
raise Exception("input file not exists! ({})".format(source))

try:
# MENTION: never move and change the sequence, since file IO.
# analyses the visible box of each page, aka the box scale. res=[(x1,y1,x2,y2)]
Expand All @@ -66,39 +73,34 @@ def edit_pdf(source: str, target: str, ignore=0):
edit_page_box(page, box)
outpdf.add_page(page)

Path(target).dirname().makedirs_p()
logger.info("output to {}", Path(target))
target.abspath().dirname().makedirs_p()
with open(target, "wb") as outfd:
outpdf.write(outfd)
logger.info("output file: {}", target)

except UnicodeEncodeError as ue:
logger.exception("UnicodeEncodeError while processing file:{}", source)
logger.exception(ue)
raise ue
except Exception as e:
logger.exception("Some other Error while processing file:{}", source)
logger.exception("Some unknown Error while processing file:{}", source)
logger.exception(e)
raise e


def scan_files(folder, glob=""):
    """
    Collect the base names of the entries listed under `folder`.

    `glob` is handed unchanged to `Path(folder).listdir(...)` as its filter,
    and the `basename()` of every returned entry is gathered into a list.
    """
    return [entry.basename() for entry in Path(folder).listdir(glob)]

def batch_edit_pdf(indir: Path, outdir: Path, ignore=0):
# guard type
indir = Path(indir)
outdir = Path(outdir)

def batch_edit_pdf(indir, outdir, ignore=0):
if indir == outdir:
raise Exception("input and output can not be the same!")

files = scan_files(indir, glob="*.pdf")
logger.info(files)
files = [pdf.basename() for pdf in indir.listdir("*.pdf")]
logger.info("pdf files in spec folder: {}", files)

if not os.path.exists(indir):
os.mkdir(indir)
# guard dir
outdir.makedirs_p()

logger.info("input dir: {}", indir)
logger.info("output dir: {}", outdir)
Expand Down
4 changes: 2 additions & 2 deletions pdf_white_cut/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ def get_parser():
"-id",
help="input directory",
action="store",
default="",
default=".",
type=str,
dest="indir",
)
parser.add_argument(
"-od",
help="output directory",
action="store",
default="",
default=".",
type=str,
dest="outdir",
)
Expand Down
4 changes: 4 additions & 0 deletions tests/test_cli.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Smoke test for cli.py exit codes.
# Each case prints "<actual> == <expected>" as before, but the script now also
# exits non-zero when any actual exit code differs from the expected one, so a
# regression is detectable by the caller instead of only by reading the output.
status=0

# An existing input file is expected to succeed (exit code 0).
python cli.py -i cases/input/input.pdf -o output/output.pdf > /dev/null 2>&1
rc=$?
echo "$rc" "== 0"
[ "$rc" -eq 0 ] || status=1

# A missing input file is expected to fail (exit code 1).
python cli.py -i cases/input/non_exist.pdf -o output/output.pdf > /dev/null 2>&1
rc=$?
echo "$rc" "== 1"
[ "$rc" -eq 1 ] || status=1

exit "$status"

0 comments on commit e8829eb

Please sign in to comment.