Skip to content

Commit

Permalink
Improve compatibility with winhttrack 3.48-22
Browse files Browse the repository at this point in the history
  • Loading branch information
ato committed Apr 19, 2024
1 parent 1a59304 commit cdc87cc
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 7 deletions.
2 changes: 2 additions & 0 deletions src/au/gov/nla/httrack2warc/Httrack2Warc.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ public class Httrack2Warc {
"hts-cache/old.ndx",
"hts-cache/old.txt",
"hts-cache/old.zip",
"hts-cache/readme.txt",
"hts-cache/winprofile.ini",
"hts-err.txt",
"hts-ioinfo.txt",
"hts-log.txt",
Expand Down
2 changes: 1 addition & 1 deletion src/au/gov/nla/httrack2warc/httrack/HtsDoitParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
import java.util.regex.Pattern;

class HtsDoitParser {
private static final Pattern DOIT_CMDLINE_RE = Pattern.compile(".*-O ?(?:\"([^\"]*)\"|([^ ]*)) .*");
private static final Pattern DOIT_CMDLINE_RE = Pattern.compile(".*-O1? ?(?:\"([^\"]*)\"|([^ ]*)) .*");
private static final Pattern DOIT_TS_RE = Pattern.compile("File generated automatically on (.*), do NOT edit");
static final DateTimeFormatter HTS_LOCAL_DATE = DateTimeFormatter.ofPattern("EEE, d MMM yyyy HH:mm:ss", Locale.US);

Expand Down
13 changes: 9 additions & 4 deletions src/au/gov/nla/httrack2warc/httrack/HtsLogParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class HtsLogParser implements Closeable {
private static final Pattern HEADER_RE = Pattern.compile("HTTrack(?<version>[^ ]+) launched on " +
"(?<date>\\w+, \\d\\d \\w+ \\d\\d\\d\\d \\d\\d:\\d\\d:\\d\\d) at " +
"(?<seedsAndFilters>.*)");
private static final Pattern CMDLINE_RE = Pattern.compile("\\(.*-O ?(?:\"([^\"]*)\"|([^ ]*)) .*");
private static final Pattern CMDLINE_RE = Pattern.compile("\\(.*-O1? ?(?:\"([^\"]*)\"|([^ ]*)) .*");

private final BufferedReader reader;
String version;
Expand Down Expand Up @@ -63,9 +63,14 @@ private void readHeader() throws IOException {

private void readCmdLine() throws IOException {
String line = reader.readLine();
if (line == null) {
return;
}
if (line == null) return;

// skip a blank line
// some logs produced by winhttrack seem to have lines ending with \r\r\n which java reads as extra blank line
if (line.isEmpty()) line = reader.readLine();
if (line == null) return;
if (line.length() < 3) return;

commandLine = line.substring(1, line.length() - 1).trim().split(" ", 2)[1];
Matcher matcher = CMDLINE_RE.matcher(line);
if (!matcher.matches()) {
Expand Down
19 changes: 19 additions & 0 deletions src/au/gov/nla/httrack2warc/httrack/HtsUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.netpreserve.urlcanon.Canonicalizer;
import org.netpreserve.urlcanon.ParsedUrl;

import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

public class HtsUtil {
Expand All @@ -41,6 +42,24 @@ public static String fixupUrl(String raw) {
return url.toString();
}

/**
 * Percent-encodes a string (the variant labelled "mode 0").
 *
 * <p>The input is converted to UTF-8 and processed byte by byte:
 * <ul>
 *   <li>a backslash is rewritten as a forward slash;</li>
 *   <li>control bytes (0-31), space, double quote and DEL (127) are written
 *       as {@code %xx} with two lower-case hex digits;</li>
 *   <li>bytes of multi-byte UTF-8 sequences (0x80 and above) are also written
 *       as {@code %xx}, because Java bytes are signed and those values
 *       compare as less than the space character;</li>
 *   <li>every other byte is copied through unchanged.</li>
 * </ul>
 *
 * @param str the text to encode; must not be null
 * @return the encoded text
 */
public static String percentEncode(String str) {
    StringBuilder out = new StringBuilder();
    for (byte octet : str.getBytes(StandardCharsets.UTF_8)) {
        if (octet == '\\') {
            out.append('/');
            continue;
        }
        // Signed comparison: this is true for 0..32 and for every byte >= 0x80.
        boolean needsEscape = octet <= ' ' || octet == '"' || octet == 127;
        if (needsEscape) {
            int unsigned = octet & 0xff;
            out.append('%')
                    .append(Character.forDigit(unsigned >>> 4, 16))
                    .append(Character.forDigit(unsigned & 0x0f, 16));
        } else {
            out.append((char) octet);
        }
    }
    return out.toString();
}

public static String stripProtocol(String url) {
return PROTOCOL.matcher(url).replaceFirst("");
}
Expand Down
4 changes: 2 additions & 2 deletions src/au/gov/nla/httrack2warc/httrack/HttrackCrawl.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ private void parseHtsLog() throws IOException {
try (HtsLogParser htsLog = new HtsLogParser(Files.newInputStream(dir.resolve(file)))) {
httrackVersion = htsLog.version;
launchTime = htsLog.launchTime;
outputDir = htsLog.outputDir;
outputDir = htsLog.outputDir == null ? null : HtsUtil.percentEncode(htsLog.outputDir);
httrackOptions = htsLog.commandLine;
} catch (NoSuchFileException e) {
// try next
Expand All @@ -109,7 +109,7 @@ private void parseDoitLog() throws IOException {
try (InputStream stream = Files.newInputStream(logFile)) {
HtsDoitParser doitLog = new HtsDoitParser(stream);
launchTime = doitLog.crawlStartTime;
outputDir = doitLog.outputDir;
outputDir = doitLog.outputDir == null ? null : HtsUtil.percentEncode(doitLog.outputDir);
httrackOptions = doitLog.commandLine;
}
}
Expand Down
12 changes: 12 additions & 0 deletions test/au/gov/nla/httrack2warc/httrack/HtsLogParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@

import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;

import static org.junit.Assert.assertEquals;
Expand All @@ -35,4 +37,14 @@ public void test() throws IOException {
assertEquals("http://www.industry.gov.au/acreagereleases/ar_home.html -O \"/pandas/working/13982/20030403\" -%HzZfI0A50000c6H1tx%xo0b1%sqZI0%I0%Hr50M1000000000E172800%PnK0L1p3Das0 -j -%A standard -%U pandora -#Z -#f -pandora.nla.gov.au* -www.nla.gov.au/pandora*", htsLog.commandLine);
}
}

/**
 * Parses a log header in the shape written by winhttrack 3.48-22: every line
 * ends with {@code \r\r\n}, and the output directory is passed as
 * {@code -O1} followed by a quoted Windows path containing spaces.
 */
@Test
public void testWinhttrack() throws IOException {
    String data = "HTTrack3.48-22+htsswf+htsjava launched on Mon, 05 Sep 2016 09:24:43 at http://example.com/ +*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/* -mime:application/foobar\r\r\n" +
            "(winhttrack -qwC2%Pxs0b0u1%s%uN0%I0p3DaK0H0%kf2A25000%f#f -F -%F \"<!-- Mirrored from %s%s by HTTrack Website Copier/3.x [XR&CO'2014], %s -->\" -%l \"en, *\" http://www.example.com/ -O1 \"C:\\My Web Sites\\Example Site\" +*.png +*.gif +*.jpg +*.jpeg +*.css +*.js -ad.doubleclick.net/* -mime:application/foobar )\r\r\n" +
            "\r\r\n" +
            "Information, Warnings and Errors reported for this mirror:\r\r\n";
    // HtsLogParser is Closeable; close it so the test does not leak the reader.
    try (HtsLogParser parser = new HtsLogParser(new ByteArrayInputStream(data.getBytes(StandardCharsets.US_ASCII)))) {
        assertEquals("C:\\My Web Sites\\Example Site/", parser.outputDir);
    }
}
}
9 changes: 9 additions & 0 deletions test/au/gov/nla/httrack2warc/httrack/HtsUtilTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package au.gov.nla.httrack2warc.httrack;

import junit.framework.TestCase;

/**
 * Unit tests for {@link HtsUtil}.
 */
public class HtsUtilTest extends TestCase {
    /** Space, NUL and double quote are each written as a lower-case {@code %xx} escape. */
    public void testPercentEncode() {
        assertEquals("a%20b%20%00%20%22c%22", HtsUtil.percentEncode("a b \0 \"c\""));
    }

    /** Backslashes in a Windows-style path are rewritten as forward slashes. */
    public void testPercentEncodeBackslash() {
        assertEquals("C:/My%20Web%20Sites", HtsUtil.percentEncode("C:\\My Web Sites"));
    }
}

0 comments on commit cdc87cc

Please sign in to comment.