Skip to content

Commit

Permalink
Upgrade tika to 2.9.0 from 1.28.5
Browse files Browse the repository at this point in the history
  • Loading branch information
ato committed Sep 18, 2023
1 parent 0d7f029 commit 1747374
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 41 deletions.
41 changes: 23 additions & 18 deletions ui/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
<properties>
<spring.version>3.1.1</spring.version>
<doss.version>1.7.4</doss.version>
+ <tika.version>2.9.0</tika.version>
</properties>

<dependencies>
Expand Down Expand Up @@ -306,24 +307,28 @@

<dependency>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
- <!-- 2.x causes unit tests to fail, probably need some refactoring to make it work -->
- <version>1.28.5</version>
- <exclusions>
- <!-- prefer spring's newer version of jackson -->
- <exclusion>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-core</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-annotations</artifactId>
- </exclusion>
- <exclusion>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- </exclusion>
- </exclusions>
+ <artifactId>tika-core</artifactId>
+ <version>${tika.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-html-module</artifactId>
+ <version>${tika.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-microsoft-module</artifactId>
+ <version>${tika.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-miscoffice-module</artifactId>
+ <version>${tika.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-pdf-module</artifactId>
+ <version>${tika.version}</version>
+ </dependency>

<dependency>
Expand Down
10 changes: 0 additions & 10 deletions ui/resources/bamboo/task/tika.xml

This file was deleted.

7 changes: 0 additions & 7 deletions ui/src/bamboo/api/DataApiController.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,10 @@
import bamboo.crawl.Crawl;
import bamboo.crawl.Warc;
import bamboo.crawl.WarcsController;
import bamboo.task.TextCache;
import com.drew.metadata.mov.atoms.Atom;
import jakarta.servlet.http.HttpServletRequest;
import jakarta.servlet.http.HttpServletResponse;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.core.io.PathResource;
import org.springframework.core.io.UrlResource;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.stereotype.Controller;
Expand All @@ -22,11 +17,9 @@
import org.springframework.util.StringUtils;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.util.UriComponentsBuilder;
import org.springframework.web.util.UriUtils;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Base64;
Expand Down
12 changes: 6 additions & 6 deletions ui/src/bamboo/task/TextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,14 @@
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
- import java.util.*;
+ import java.util.Arrays;
+ import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand All @@ -63,7 +63,7 @@ public TextExtractor() {
Files.copy(stream, logbackConfig, REPLACE_EXISTING);
}

- TikaConfig config = new TikaConfig(getClass().getResource("tika.xml"));
+ TikaConfig config = new TikaConfig();
ForkParser parser = new ForkParser(getClass().getClassLoader(), new AutoDetectParser(config));
parser.setServerParseTimeoutMillis(15000); // don't spend too long on any one record
Path javaBinary = Path.of(System.getProperty("java.home"), "bin", "java");
Expand All @@ -74,7 +74,7 @@ public TextExtractor() {
this.parser = parser;
} catch (Exception e) {
close();
- throw new RuntimeException("Error configuring tika via tika.xml", e);
+ throw new RuntimeException("Error configuring tika", e);
}
}

Expand Down Expand Up @@ -229,11 +229,11 @@ public void extractTika(InputStream record, Document doc, URI baseUrl) throws Te
doc.setText(clean(bodyHandler.toString()));
doc.setTitle(clean(getAny(metadata, TikaCoreProperties.TITLE.getName())));
doc.setDescription(clean(getAny(metadata, "description", "DC.description", "DC.Description", "dcterms.description")));
- doc.setKeywords(clean(getAny(metadata, "Keywords", "keywords", "DC.keywords", "DC.Keywords", "dcterms.keywords")));
+ doc.setKeywords(clean(getAny(metadata, "Keywords", "keywords", "DC.keywords", "DC.Keywords", "dcterms.keywords", "pdf:docinfo:keywords")));
doc.setPublisher(clean(getAny(metadata, "publisher", "DC.publisher", "DC.Publisher", "dcterms.publisher")));
doc.setCreator(clean(getAny(metadata, "creator", "DC.creator", "DC.Creator", "dcterms.creator")));
doc.setContributor(clean(getAny(metadata, "contributor", "DC.contributor", "DC.Contributor", "dcterms.contributor")));
- doc.setCoverage(clean(getAny(metadata, "coverage", "DC.coverage", "DC.Coverage", "dcterms.coverage", "subject", "Subject")));
+ doc.setCoverage(clean(getAny(metadata, "coverage", "DC.coverage", "DC.Coverage", "dcterms.coverage", "subject", "Subject", "pdf:docinfo:subject")));
doc.setH1(headingHandler.getHeadings());
doc.setOgSiteName(clean(metadata.get("og:site_name")));
doc.setOgTitle(clean(metadata.get("og:title")));
Expand Down

0 comments on commit 1747374

Please sign in to comment.