diff --git a/src/org/netpreserve/jwarc/FetchOptions.java b/src/org/netpreserve/jwarc/FetchOptions.java new file mode 100644 index 0000000..c8ffc3d --- /dev/null +++ b/src/org/netpreserve/jwarc/FetchOptions.java @@ -0,0 +1,69 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * Copyright (C) 2023 National Library of Australia and the jwarc contributors + */ +package org.netpreserve.jwarc; + +import java.io.OutputStream; +import java.net.URI; + + +/** + * Options for fetching a remote resource. + * + * @see WarcWriter#fetch(URI, FetchOptions) + */ +public class FetchOptions { + long maxLength = 0; + long maxTime = 0; + int readTimeout = 60000; + String userAgent = "jwarc"; + OutputStream copyTo; + + /** + * Stops the fetch after this many bytes are received (including any protocol headers). If this limit was reached + * the header "WARC-Truncated: length" will be added to the response record. Defaults to 0, meaning no limit. + */ + public FetchOptions maxLength(long bytes) { + this.maxLength = bytes; + return this; + } + + /** + * Stops the fetch after this many milliseconds have elapsed. If this limit was reached the header + * "WARC-Truncated: time" will be added to the response record. Defaults to 0, meaning no limit. + */ + public FetchOptions maxTime(long millis) { + this.maxTime = millis; + return this; + } + + + /** + * Sets the read timeout in milliseconds on the socket. Defaults to 60000. Set to 0 for no timeout. + * + * @see java.net.Socket#setSoTimeout(int) + */ + public FetchOptions readTimeout(int millis) { + this.readTimeout = millis; + return this; + } + + /** + * Sets the User-Agent request header. Default: "jwarc" + *

+ * If a custom HTTP request is provided this option will be ignored. + */ + public FetchOptions userAgent(String userAgent) { + this.userAgent = userAgent; + return this; + } + + /** + * If specified, the raw response bytes will be copied to this OutputStream as well as the WARC file. Errors writing to it are ignored. + */ + public FetchOptions copyTo(OutputStream copyTo) { + this.copyTo = copyTo; + return this; + } +} \ No newline at end of file diff --git a/src/org/netpreserve/jwarc/MessageBody.java b/src/org/netpreserve/jwarc/MessageBody.java index 9268a29..05eb545 100644 --- a/src/org/netpreserve/jwarc/MessageBody.java +++ b/src/org/netpreserve/jwarc/MessageBody.java @@ -1,6 +1,5 @@ package org.netpreserve.jwarc; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; @@ -16,6 +15,10 @@ public static MessageBody empty() { return LengthedBody.EMPTY; } + /** + * Returns the length of the body. This may be less than the Content-Length header if the record was truncated. + * Returns -1 if the length cannot be determined (such as when chunked encoding is used). + */ public long size() throws IOException { return -1; } diff --git a/src/org/netpreserve/jwarc/WarcWriter.java b/src/org/netpreserve/jwarc/WarcWriter.java index 4ef8e71..405bb55 100644 --- a/src/org/netpreserve/jwarc/WarcWriter.java +++ b/src/org/netpreserve/jwarc/WarcWriter.java @@ -1,6 +1,6 @@ /* * SPDX-License-Identifier: Apache-2.0 - * Copyright (C) 2018 National Library of Australia and the jwarc contributors + * Copyright (C) 2018-2023 National Library of Australia and the jwarc contributors */ package org.netpreserve.jwarc; @@ -80,23 +80,46 @@ public synchronized void write(WarcRecord record) throws IOException { * Downloads a remote resource recording the request and response as WARC records. 
*/ public FetchResult fetch(URI uri) throws IOException { + return fetch(uri, new FetchOptions()); + } + + /** + * Downloads a remote resource with a GET request, recording the request and response as WARC records. + *

+ * @param uri URL to download + * @param options fetch options to use; its User-Agent is sent with the request + * @throws IOException if an IO error occurred + */ + public FetchResult fetch(URI uri, FetchOptions options) throws IOException { HttpRequest httpRequest = new HttpRequest.Builder("GET", uri) .version(MessageVersion.HTTP_1_0) // until we support chunked encoding - .addHeader("User-Agent", "jwarc") + .addHeader("User-Agent", options.userAgent) .addHeader("Connection", "close") .build(); - return fetch(uri, httpRequest, null); + return fetch(uri, httpRequest, options); } /** * Downloads a remote resource recording the request and response as WARC records. *

- * @param uri to download + * @param uri URL to download * @param httpRequest request to send * @param copyTo if not null will receive a copy of the (raw) http response bytes * @throws IOException if an IO error occurred */ public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo) throws IOException { + return fetch(uri, httpRequest, new FetchOptions().copyTo(copyTo)); + } + + /** + * Downloads a remote resource recording the request and response as WARC records. + *

+ * @param uri URL to download + * @param httpRequest request to send + * @param options fetch options to use (read timeout, length and time limits, optional copy stream) + * @throws IOException if an IO error occurred + */ + public FetchResult fetch(URI uri, HttpRequest httpRequest, FetchOptions options) throws IOException { Path tempPath = Files.createTempFile("jwarc", ".tmp"); try (FileChannel tempFile = FileChannel.open(tempPath, READ, WRITE, DELETE_ON_CLOSE, TRUNCATE_EXISTING)) { byte[] httpRequestBytes = httpRequest.serializeHeader(); @@ -106,22 +129,41 @@ public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo) MessageDigest responseBlockDigest = MessageDigest.getInstance(digestAlgorithm); InetAddress ip; Instant date = Instant.now(); + long startMillis = date.toEpochMilli(); + WarcTruncationReason truncationReason = null; try (Socket socket = IOUtils.connect(uri.getScheme(), uri.getHost(), uri.getPort())) { socket.setTcpNoDelay(true); + socket.setSoTimeout(options.readTimeout); ip = ((InetSocketAddress)socket.getRemoteSocketAddress()).getAddress(); socket.getOutputStream().write(httpRequestBytes); InputStream inputStream = socket.getInputStream(); byte[] buf = new byte[8192]; + long totalLength = 0; while (true) { - int n = inputStream.read(buf); + int len; + if (options.maxLength > 0 && options.maxLength - totalLength < buf.length) { + len = (int)(options.maxLength - totalLength); + } else { + len = buf.length; + } + int n = inputStream.read(buf, 0, len); if (n < 0) break; + totalLength += n; tempFile.write(ByteBuffer.wrap(buf, 0, n)); responseBlockDigest.update(buf, 0, n); try { - if (copyTo != null) copyTo.write(buf, 0, n); + if (options.copyTo != null) options.copyTo.write(buf, 0, n); } catch (IOException e) { // ignore } + if (options.maxTime > 0 && System.currentTimeMillis() - startMillis > options.maxTime) { + truncationReason = WarcTruncationReason.TIME; + break; + } + if (options.maxLength > 0 && totalLength >= options.maxLength) { + truncationReason = WarcTruncationReason.LENGTH; + 
break; + } } } @@ -137,6 +179,7 @@ public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo) if (responsePayloadDigest != null) { responseBuilder.payloadDigest(new WarcDigest(responsePayloadDigest)); } + if (truncationReason != null) responseBuilder.truncated(truncationReason); WarcResponse response = responseBuilder.build(); response.http(); // force HTTP header to be parsed before body is consumed so that caller can use it write(response); diff --git a/src/org/netpreserve/jwarc/tools/FetchTool.java b/src/org/netpreserve/jwarc/tools/FetchTool.java index badf44f..2cc7e8b 100644 --- a/src/org/netpreserve/jwarc/tools/FetchTool.java +++ b/src/org/netpreserve/jwarc/tools/FetchTool.java @@ -1,16 +1,60 @@ package org.netpreserve.jwarc.tools; +import org.netpreserve.jwarc.FetchOptions; import org.netpreserve.jwarc.WarcWriter; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; public class FetchTool { public static void main(String[] args) throws IOException, URISyntaxException { + FetchOptions options = new FetchOptions(); + List<URI> urls = new ArrayList<>(); + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "-h": + case "--help": + System.out.println("Usage: jwarc fetch [options] url..."); + System.out.println("Fetches a URL while writing the request and response as WARC records"); + System.out.println(); + System.out.println("Options:"); + System.out.println("-A, --user-agent STRING Sets the User-Agent header"); + System.out.println(" --read-timeout MILLIS Sets the socket read timeout"); + System.out.println(" --max-length BYTES Truncate response after BYTES received"); + System.out.println(" --max-time MILLIS Truncate response after MILLIS elapsed"); + System.exit(0); + break; + case "-A": + case "--user-agent": + options.userAgent(args[++i]); + break; + case "--read-timeout": + options.readTimeout(Integer.parseInt(args[++i])); + break; + case 
"--max-length": + options.maxLength(Long.parseLong(args[++i])); /* the option is a long; limits may exceed Integer.MAX_VALUE */ + break; + case "--max-time": + options.maxTime(Long.parseLong(args[++i])); + break; + default: + if (args[i].startsWith("-")) { + System.err.println("Unknown option: " + args[i]); + System.exit(1); + } + urls.add(new URI(args[i])); + } + } + if (urls.isEmpty()) { + System.err.println("No URLs specified. Try: jwarc fetch --help"); + System.exit(1); + } try (WarcWriter writer = new WarcWriter(System.out)) { - for (String arg : args) { - writer.fetch(new URI(arg)); + for (URI url : urls) { + writer.fetch(url, options); } } } diff --git a/test/org/netpreserve/jwarc/apitests/WarcWriterTest.java b/test/org/netpreserve/jwarc/apitests/WarcWriterTest.java index 079fc53..4a5b0c0 100644 --- a/test/org/netpreserve/jwarc/apitests/WarcWriterTest.java +++ b/test/org/netpreserve/jwarc/apitests/WarcWriterTest.java @@ -1,6 +1,6 @@ /* * SPDX-License-Identifier: Apache-2.0 - * Copyright (C) 2020 National Library of Australia and the jwarc contributors + * Copyright (C) 2020-2023 National Library of Australia and the jwarc contributors */ package org.netpreserve.jwarc.apitests; @@ -32,9 +32,12 @@ public class WarcWriterTest { @Test public void fetch() throws IOException, NoSuchAlgorithmException, URISyntaxException { - byte[] body = "Hello world!".getBytes(StandardCharsets.UTF_8); - MessageDigest bodyDigest = MessageDigest.getInstance("SHA-1"); - bodyDigest.update(body); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] message = "Hello world!\n".getBytes(StandardCharsets.UTF_8); + while (baos.size() < 4096) { + baos.write(message); + } + byte[] body = baos.toByteArray(); // get loopback address HttpServer server = HttpServer.create(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0), 0); @@ -51,7 +54,8 @@ public void fetch() throws IOException, NoSuchAlgorithmException, URISyntaxExcep WarcWriter warcWriter = new WarcWriter(Channels.newChannel(out)); URI uri = new URI("http", null, 
server.getAddress().getHostString(), server.getAddress().getPort(), "/", null, null); - FetchResult result = warcWriter.fetch(uri); + int maxLength = 512; + FetchResult result = warcWriter.fetch(uri, new FetchOptions().maxLength(maxLength)); assertEquals(256, result.response().http().status()); assertEquals("/", result.request().http().target()); @@ -61,10 +65,19 @@ public void fetch() throws IOException, NoSuchAlgorithmException, URISyntaxExcep WarcResponse response = (WarcResponse) warcReader.next() .orElseThrow(() -> new RuntimeException("Missing response record")); + System.out.println(new String(response.serializeHeader())); + //assertEquals(12, response.http().body().size()); assertEquals(256, response.http().status()); assertEquals("present", response.http().headers().first("Test-Header").orElse(null)); assertTrue(response.blockDigest().isPresent()); assertEquals(response.calculatedBlockDigest(), response.blockDigest()); + + assertEquals(WarcTruncationReason.LENGTH, response.truncated()); + assertEquals(Optional.of(maxLength), response.headers().sole("Content-Length").map(Integer::parseInt)); + assertEquals(maxLength, response.body().size()); + MessageDigest bodyDigest = MessageDigest.getInstance("SHA-1"); + long payloadSize = response.http().body().size(); + bodyDigest.update(body, 0, (int) payloadSize); assertEquals(new WarcDigest(bodyDigest).toString(), response.payloadDigest().map(Object::toString).orElse(null)); WarcRequest request = (WarcRequest) warcReader.next()