diff --git a/src/org/netpreserve/jwarc/FetchOptions.java b/src/org/netpreserve/jwarc/FetchOptions.java new file mode 100644 index 0000000..c8ffc3d --- /dev/null +++ b/src/org/netpreserve/jwarc/FetchOptions.java @@ -0,0 +1,69 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * Copyright (C) 2023 National Library of Australia and the jwarc contributors + */ +package org.netpreserve.jwarc; + +import java.io.OutputStream; +import java.net.URI; + + +/** + * Options for fetching a remote resource. + * + * @see WarcWriter#fetch(URI, FetchOptions) + */ +public class FetchOptions { + long maxLength = 0; + long maxTime = 0; + int readTimeout = 60000; + String userAgent = "jwarc"; + OutputStream copyTo; + + /** + * Stops the fetch after this many bytes are received (including any protocol headers). If this limit was reached + * the header "WARC-Truncated: length" will be added to the response record. + */ + public FetchOptions maxLength(long bytes) { + this.maxLength = bytes; + return this; + } + + /** + * Stops the fetch after this many milliseconds have elapsed. If this limit was reached the header + * "WARC-Truncated: time" will be added to the response record. + */ + public FetchOptions maxTime(long millis) { + this.maxTime = millis; + return this; + } + + + /** + * Sets the read timeout in milliseconds on the socket. Defaults to 60000. Set to 0 for no timeout. + * + * @see java.net.Socket#setSoTimeout(int) + */ + public FetchOptions readTimeout(int millis) { + this.readTimeout = millis; + return this; + } + + /** + * Sets the User-Agent request header. Default: "jwarc" + *
+ * If a custom HTTP request is provided this option will be ignored. + */ + public FetchOptions userAgent(String userAgent) { + this.userAgent = userAgent; + return this; + } + + /** + * If specified the response will also be copied to this OutputStream as well as the WARC file. + */ + public FetchOptions copyTo(OutputStream copyTo) { + this.copyTo = copyTo; + return this; + } +} \ No newline at end of file diff --git a/src/org/netpreserve/jwarc/MessageBody.java b/src/org/netpreserve/jwarc/MessageBody.java index 9268a29..05eb545 100644 --- a/src/org/netpreserve/jwarc/MessageBody.java +++ b/src/org/netpreserve/jwarc/MessageBody.java @@ -1,6 +1,5 @@ package org.netpreserve.jwarc; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; @@ -16,6 +15,10 @@ public static MessageBody empty() { return LengthedBody.EMPTY; } + /** + * Returns the length of the body. This may be less than the Content-Length header if the record was truncated. + * Returns -1 if the length cannot be determined (such as when chunked encoding is used). + */ public long size() throws IOException { return -1; } diff --git a/src/org/netpreserve/jwarc/WarcWriter.java b/src/org/netpreserve/jwarc/WarcWriter.java index 4ef8e71..405bb55 100644 --- a/src/org/netpreserve/jwarc/WarcWriter.java +++ b/src/org/netpreserve/jwarc/WarcWriter.java @@ -1,6 +1,6 @@ /* * SPDX-License-Identifier: Apache-2.0 - * Copyright (C) 2018 National Library of Australia and the jwarc contributors + * Copyright (C) 2018-2023 National Library of Australia and the jwarc contributors */ package org.netpreserve.jwarc; @@ -80,23 +80,46 @@ public synchronized void write(WarcRecord record) throws IOException { * Downloads a remote resource recording the request and response as WARC records. 
*/ public FetchResult fetch(URI uri) throws IOException { + return fetch(uri, new FetchOptions()); + } + + /** + * Downloads a remote resource recording the request and response as WARC records. + *
+ * @param uri URL to download + * @param options fetch options to use + * @throws IOException if an IO error occurred + */ + public FetchResult fetch(URI uri, FetchOptions options) throws IOException { HttpRequest httpRequest = new HttpRequest.Builder("GET", uri) .version(MessageVersion.HTTP_1_0) // until we support chunked encoding - .addHeader("User-Agent", "jwarc") + .addHeader("User-Agent", options.userAgent) .addHeader("Connection", "close") .build(); - return fetch(uri, httpRequest, null); + return fetch(uri, httpRequest, options); } /** * Downloads a remote resource recording the request and response as WARC records. *
- * @param uri to download + * @param uri URL to download * @param httpRequest request to send * @param copyTo if not null will receive a copy of the (raw) http response bytes * @throws IOException if an IO error occurred */ public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo) throws IOException { + return fetch(uri, httpRequest, new FetchOptions().copyTo(copyTo)); + } + + /** + * Downloads a remote resource recording the request and response as WARC records. + *
+ * @param uri URL to download
+ * @param httpRequest request to send
+ * @param options fetch options (length and time limits, read timeout, copy target); must not be null
+ * @throws IOException if an IO error occurred
+ */
+ public FetchResult fetch(URI uri, HttpRequest httpRequest, FetchOptions options) throws IOException {
Path tempPath = Files.createTempFile("jwarc", ".tmp");
try (FileChannel tempFile = FileChannel.open(tempPath, READ, WRITE, DELETE_ON_CLOSE, TRUNCATE_EXISTING)) {
byte[] httpRequestBytes = httpRequest.serializeHeader();
@@ -106,22 +129,41 @@ public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo)
MessageDigest responseBlockDigest = MessageDigest.getInstance(digestAlgorithm);
InetAddress ip;
Instant date = Instant.now();
+ long startMillis = date.toEpochMilli(); // reference point for the maxTime limit
+ WarcTruncationReason truncationReason = null; // set only when a limit cuts the fetch short
try (Socket socket = IOUtils.connect(uri.getScheme(), uri.getHost(), uri.getPort())) {
socket.setTcpNoDelay(true);
+ socket.setSoTimeout(options.readTimeout);
ip = ((InetSocketAddress)socket.getRemoteSocketAddress()).getAddress();
socket.getOutputStream().write(httpRequestBytes);
InputStream inputStream = socket.getInputStream();
byte[] buf = new byte[8192];
+ long totalLength = 0; // response bytes received so far
while (true) {
- int n = inputStream.read(buf);
+ int len; // size of the next read, clamped so totalLength never exceeds maxLength
+ if (options.maxLength > 0 && options.maxLength - totalLength < buf.length) {
+ len = (int)(options.maxLength - totalLength);
+ } else {
+ len = buf.length;
+ }
+ int n = inputStream.read(buf, 0, len);
if (n < 0) break;
+ totalLength += n;
tempFile.write(ByteBuffer.wrap(buf, 0, n));
responseBlockDigest.update(buf, 0, n);
try {
- if (copyTo != null) copyTo.write(buf, 0, n);
+ if (options.copyTo != null) options.copyTo.write(buf, 0, n);
} catch (IOException e) {
// ignore
}
+ if (options.maxTime > 0 && System.currentTimeMillis() - startMillis > options.maxTime) { // elapsed wall-clock time since the fetch started
+ truncationReason = WarcTruncationReason.TIME;
+ break;
+ }
+ if (options.maxLength > 0 && totalLength >= options.maxLength) {
+ truncationReason = WarcTruncationReason.LENGTH;
+ break;
+ }
}
}
@@ -137,6 +179,7 @@ public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo)
if (responsePayloadDigest != null) {
responseBuilder.payloadDigest(new WarcDigest(responsePayloadDigest));
}
+ if (truncationReason != null) responseBuilder.truncated(truncationReason); // non-null only when the maxTime or maxLength limit ended the fetch early
WarcResponse response = responseBuilder.build();
response.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
write(response);
diff --git a/src/org/netpreserve/jwarc/tools/FetchTool.java b/src/org/netpreserve/jwarc/tools/FetchTool.java
index badf44f..2cc7e8b 100644
--- a/src/org/netpreserve/jwarc/tools/FetchTool.java
+++ b/src/org/netpreserve/jwarc/tools/FetchTool.java
@@ -1,16 +1,60 @@
package org.netpreserve.jwarc.tools;
+import org.netpreserve.jwarc.FetchOptions;
import org.netpreserve.jwarc.WarcWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.List;
public class FetchTool {
public static void main(String[] args) throws IOException, URISyntaxException {
+ FetchOptions options = new FetchOptions();
+ List