From 251df6c33015a64f5b90bcb4fcd56830d0de054b Mon Sep 17 00:00:00 2001 From: "Hannes R. Brunsch" Date: Wed, 10 Jul 2024 14:04:35 +0200 Subject: [PATCH] docs: add comments in DocxLoader --- src/main/java/eu/snik/tag/DocxLoader.java | 52 +++++++++++++---------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/src/main/java/eu/snik/tag/DocxLoader.java b/src/main/java/eu/snik/tag/DocxLoader.java index 6093ea56..c166a579 100644 --- a/src/main/java/eu/snik/tag/DocxLoader.java +++ b/src/main/java/eu/snik/tag/DocxLoader.java @@ -23,9 +23,6 @@ /** Extracts SNIK classes from a tagged DOCX file. */ public class DocxLoader extends Loader { - // private static ObjectFactory factory = Context.getWmlObjectFactory(); - // private static int commentId = 10000; - /** * Creates a new instance for the DOCX loader to load one DOCX file * @param in Input stream for the DOCX file to load @@ -35,11 +32,18 @@ public DocxLoader(InputStream in) throws IOException { super(in); } - /** from https://stackoverflow.com/questions/19676282/docx4j-find-and-replace */ - static List getAllElementsFromObject(Object obj, Class... toSearch) { + /** + * Search for any occurrences of Docx4J instances from any given node in the document tree. + * Originally from this blog post. + * @param obj object to search in + * @param toSearch Classes to search for + * @return List of all occurrences of instances of the given classes as self, children or transitive children of the given object. + */ + private static List getAllElementsFromObject(Object obj, Class... toSearch) { List result = new ArrayList(); if (obj instanceof JAXBElement) obj = ((JAXBElement) obj).getValue(); + // only add object to found after processing its children if (obj instanceof ContentAccessor) { List children = ((ContentAccessor) obj).getContent(); for (Object child : children) { @@ -54,7 +58,10 @@ static List getAllElementsFromObject(Object obj, Class... 
toSearch) { return result; } - /** @return the complete text from the DOCX file without any formatting */ + /** + * Get the entire unformatted textual content of the document. + * @return the complete text from the DOCX file without any formatting (except line breaks) + */ @Override public String getText() { try { @@ -63,7 +70,10 @@ public String getText() { var doc = wordMLPackage.getMainDocumentPart(); var parts = new ArrayList(); - List texts = getAllElementsFromObject(doc, org.docx4j.wml.Text.class, org.docx4j.wml.P.class); + // extract all text passages (including paragraph objects for information on line breaks) + List texts = DocxLoader.getAllElementsFromObject(doc, org.docx4j.wml.Text.class, org.docx4j.wml.P.class); + + // convert org.docx4j.wml.Text-s to Strings (interpret paragraphs as line breaks) for (Object t : texts) { if(t instanceof org.docx4j.wml.P) { parts.add("\n\n"); @@ -72,6 +82,8 @@ public String getText() { parts.add(content.getValue()); } } + + // put the parts together return parts .stream() .reduce( @@ -100,9 +112,15 @@ public String getText() { } } - record TagClass(String tag, String description, Subtop subtop) {} + /** + * Local type used for quickly identifying tagged tokens. + */ + private record TagClass(String tag, String description, Subtop subtop) {} - /** @return all classes extracted from the tagged parts of the DOCX document*/ + /** + * Extract all classes marked in the DOCX document, without any duplicates. 
+ * @return all classes extracted from the tagged parts of the DOCX document + */ @Override public Collection getClasses() { try { @@ -142,20 +160,9 @@ public Collection getClasses() { continue; } // abbreviations processedRuns.add(run); - - /* - Comment comment = factory.createCommentsComment(); - comments.add(comment); - comment.setId(BigInteger.valueOf(++commentId)); - Text commentText = factory.createText(); - commentText.setValue("this is a comment for "+label); - comment.getContent().add(commentText); - CommentReference commentRef = factory.createRCommentReference(); - run.getContent().add(commentRef); - commentRef.setId(BigInteger.valueOf(commentId)); - */ + + // remove multiply annotated tokens, then add the rest to processedLabels Clazz clazz = new Clazz(label, labelToLocalName(label), tc.subtop); - //System.out.println(text+" "+ clazz); if (processedLabels.contains(label)) { classes .stream() @@ -184,7 +191,6 @@ public Collection getClasses() { } } - //warningCallback.ifPresent(c->c.accept(warnings.stream().reduce("", (a,b)->a+"\n"+b))); System.out.println(classes.size() + " classes extracted."); return classes;