Skip to content

Commit

Permalink
Add fetch options: maxTime, maxLength, readTimeout, userAgent
Browse files Browse the repository at this point in the history
  • Loading branch information
ato committed Jul 27, 2023
1 parent 77c6a4b commit daa1b4f
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 14 deletions.
69 changes: 69 additions & 0 deletions src/org/netpreserve/jwarc/FetchOptions.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright (C) 2023 National Library of Australia and the jwarc contributors
*/
package org.netpreserve.jwarc;

import java.io.OutputStream;
import java.net.URI;


/**
* Options for fetching a remote resource.
*
* @see WarcWriter#fetch(URI, FetchOptions)
*/
public class FetchOptions {
    // Fields are package-private so that WarcWriter (same package) can read them directly.
    long maxLength = 0;
    long maxTime = 0;
    int readTimeout = 60000;
    String userAgent = "jwarc";
    OutputStream copyTo;

    /**
     * Stops the fetch after this many bytes are received (including any protocol headers). If this limit was reached
     * the header "WARC-Truncated: length" will be added to the response record.
     * <p>
     * A value of 0 (the default) or less means no length limit is applied.
     *
     * @param bytes maximum number of bytes to receive
     * @return this object, to allow call chaining
     */
    public FetchOptions maxLength(long bytes) {
        this.maxLength = bytes;
        return this;
    }

    /**
     * Stops the fetch after this many milliseconds have elapsed. If this limit was reached the header
     * "WARC-Truncated: time" will be added to the response record.
     * <p>
     * A value of 0 (the default) or less means no time limit is applied.
     *
     * @param millis maximum duration of the fetch in milliseconds
     * @return this object, to allow call chaining
     */
    public FetchOptions maxTime(long millis) {
        this.maxTime = millis;
        return this;
    }

    /**
     * Sets the read timeout in milliseconds on the socket. Defaults to 60000. Set to 0 for no timeout.
     *
     * @param millis read timeout in milliseconds; must not be negative
     * @return this object, to allow call chaining
     * @throws IllegalArgumentException if {@code millis} is negative
     * @see java.net.Socket#setSoTimeout(int)
     */
    public FetchOptions readTimeout(int millis) {
        // Socket.setSoTimeout rejects negative values; fail here rather than midway through a fetch.
        if (millis < 0) {
            throw new IllegalArgumentException("readTimeout must not be negative: " + millis);
        }
        this.readTimeout = millis;
        return this;
    }

    /**
     * Sets the User-Agent request header. Default: "jwarc"
     * <p>
     * If a custom HTTP request is provided this option will be ignored.
     *
     * @param userAgent value to send in the User-Agent header
     * @return this object, to allow call chaining
     */
    public FetchOptions userAgent(String userAgent) {
        this.userAgent = userAgent;
        return this;
    }

    /**
     * If specified, the response will be copied to this OutputStream in addition to the WARC file.
     *
     * @param copyTo stream that receives a copy of the response, or null for none
     * @return this object, to allow call chaining
     */
    public FetchOptions copyTo(OutputStream copyTo) {
        this.copyTo = copyTo;
        return this;
    }
}
5 changes: 4 additions & 1 deletion src/org/netpreserve/jwarc/MessageBody.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.netpreserve.jwarc;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
Expand All @@ -16,6 +15,10 @@ public static MessageBody empty() {
return LengthedBody.EMPTY;
}

/**
* Returns the length of the body. This may be less than the Content-Length header if the record was truncated.
* Returns -1 if the length cannot be determined (such as when chunked encoding is used).
* <p>
* This base implementation always returns -1.
* NOTE(review): body types with a known length presumably override this method; confirm against the subclasses.
*
* @return the length of the body, or -1 if it cannot be determined
* @throws IOException declared so implementations may perform I/O to determine the length; this implementation
*                     never throws
*/
public long size() throws IOException {
return -1;
}
Expand Down
55 changes: 49 additions & 6 deletions src/org/netpreserve/jwarc/WarcWriter.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright (C) 2018 National Library of Australia and the jwarc contributors
* Copyright (C) 2018-2023 National Library of Australia and the jwarc contributors
*/

package org.netpreserve.jwarc;
Expand Down Expand Up @@ -80,23 +80,46 @@ public synchronized void write(WarcRecord record) throws IOException {
* Downloads a remote resource recording the request and response as WARC records.
*/
public FetchResult fetch(URI uri) throws IOException {
return fetch(uri, new FetchOptions());
}

/**
* Downloads a remote resource recording the request and response as WARC records.
* <p>
* @param uri URL to download
* @param options fetch options to use
* @throws IOException if an IO error occurred
*/
public FetchResult fetch(URI uri, FetchOptions options) throws IOException {
HttpRequest httpRequest = new HttpRequest.Builder("GET", uri)
.version(MessageVersion.HTTP_1_0) // until we support chunked encoding
.addHeader("User-Agent", "jwarc")
.addHeader("User-Agent", options.userAgent)
.addHeader("Connection", "close")
.build();
return fetch(uri, httpRequest, null);
return fetch(uri, httpRequest, options);
}

/**
* Downloads a remote resource recording the request and response as WARC records.
* <p>
* @param uri to download
* @param uri URL to download
* @param httpRequest request to send
* @param copyTo if not null will receive a copy of the (raw) http response bytes
* @throws IOException if an IO error occurred
*/
public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo) throws IOException {
return fetch(uri, httpRequest, new FetchOptions().copyTo(copyTo));
}

/**
* Downloads a remote resource recording the request and response as WARC records.
* <p>
* @param uri URL to download
* @param httpRequest request to send
* @param options fetch options to use
* @throws IOException if an IO error occurred
*/
public FetchResult fetch(URI uri, HttpRequest httpRequest, FetchOptions options) throws IOException {
Path tempPath = Files.createTempFile("jwarc", ".tmp");
try (FileChannel tempFile = FileChannel.open(tempPath, READ, WRITE, DELETE_ON_CLOSE, TRUNCATE_EXISTING)) {
byte[] httpRequestBytes = httpRequest.serializeHeader();
Expand All @@ -106,22 +129,41 @@ public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo)
MessageDigest responseBlockDigest = MessageDigest.getInstance(digestAlgorithm);
InetAddress ip;
Instant date = Instant.now();
long startMillis = date.toEpochMilli();
WarcTruncationReason truncationReason = null;
try (Socket socket = IOUtils.connect(uri.getScheme(), uri.getHost(), uri.getPort())) {
socket.setTcpNoDelay(true);
socket.setSoTimeout(options.readTimeout);
ip = ((InetSocketAddress)socket.getRemoteSocketAddress()).getAddress();
socket.getOutputStream().write(httpRequestBytes);
InputStream inputStream = socket.getInputStream();
byte[] buf = new byte[8192];
long totalLength = 0;
while (true) {
int n = inputStream.read(buf);
int len;
if (options.maxLength > 0 && options.maxLength - totalLength < buf.length) {
len = (int)(options.maxLength - totalLength);
} else {
len = buf.length;
}
int n = inputStream.read(buf, 0, len);
if (n < 0) break;
totalLength += n;
tempFile.write(ByteBuffer.wrap(buf, 0, n));
responseBlockDigest.update(buf, 0, n);
try {
if (copyTo != null) copyTo.write(buf, 0, n);
if (options.copyTo != null) options.copyTo.write(buf, 0, n);
} catch (IOException e) {
// ignore
}
// Enforce the time limit: elapsed milliseconds must be compared against maxTime (a duration),
// not maxLength (a byte count). Checked after each read, so a blocked read is bounded by the socket timeout.
if (options.maxTime > 0 && System.currentTimeMillis() - startMillis > options.maxTime) {
    truncationReason = WarcTruncationReason.TIME;
    break;
}
if (options.maxLength > 0 && totalLength >= options.maxLength) {
truncationReason = WarcTruncationReason.LENGTH;
break;
}
}
}

Expand All @@ -137,6 +179,7 @@ public FetchResult fetch(URI uri, HttpRequest httpRequest, OutputStream copyTo)
if (responsePayloadDigest != null) {
responseBuilder.payloadDigest(new WarcDigest(responsePayloadDigest));
}
if (truncationReason != null) responseBuilder.truncated(truncationReason);
WarcResponse response = responseBuilder.build();
response.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
write(response);
Expand Down
48 changes: 46 additions & 2 deletions src/org/netpreserve/jwarc/tools/FetchTool.java
Original file line number Diff line number Diff line change
@@ -1,16 +1,60 @@
package org.netpreserve.jwarc.tools;

import org.netpreserve.jwarc.FetchOptions;
import org.netpreserve.jwarc.WarcWriter;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

public class FetchTool {
public static void main(String[] args) throws IOException, URISyntaxException {
FetchOptions options = new FetchOptions();
List<URI> urls = new ArrayList<>();
for (int i = 0; i < args.length; i++) {
switch (args[i]) {
case "-h":
case "--help":
System.out.println("Usage: jwarc fetch [options] url...");
System.out.println("Fetches a URL while writing the request and response as WARC records");
System.out.println();
System.out.println("Options:");
System.out.println("-A, --user-agent STRING Sets the User-Agent header");
System.out.println(" --read-timeout MILLIS Sets the socket read timeout");
System.out.println(" --max-length BYTES Truncate response after BYTES received");
System.out.println(" --max-time MILLIS Truncate response after MILLIS elapsed");
System.exit(0);
break;
case "-A":
case "--user-agent":
options.userAgent(args[++i]);
break;
case "--read-timeout":
options.readTimeout(Integer.parseInt(args[++i]));
break;
case "--max-length":
    // FetchOptions.maxLength takes a long; parse as long so limits above 2^31-1 bytes are accepted.
    options.maxLength(Long.parseLong(args[++i]));
    break;
case "--max-time":
    // FetchOptions.maxTime takes a long; parse as long so limits above 2^31-1 ms are accepted.
    options.maxTime(Long.parseLong(args[++i]));
    break;
default:
if (args[i].startsWith("-")) {
System.err.println("Unknown option: " + args[i]);
System.exit(1);
}
urls.add(new URI(args[i]));
}
}
if (urls.isEmpty()) {
System.err.println("No URLs specified. Try: jwarc fetch --help");
System.exit(1);
}
try (WarcWriter writer = new WarcWriter(System.out)) {
for (String arg : args) {
writer.fetch(new URI(arg));
for (URI url : urls) {
writer.fetch(url, options);
}
}
}
Expand Down
23 changes: 18 additions & 5 deletions test/org/netpreserve/jwarc/apitests/WarcWriterTest.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright (C) 2020 National Library of Australia and the jwarc contributors
* Copyright (C) 2020-2023 National Library of Australia and the jwarc contributors
*/

package org.netpreserve.jwarc.apitests;
Expand Down Expand Up @@ -32,9 +32,12 @@ public class WarcWriterTest {

@Test
public void fetch() throws IOException, NoSuchAlgorithmException, URISyntaxException {
byte[] body = "Hello world!".getBytes(StandardCharsets.UTF_8);
MessageDigest bodyDigest = MessageDigest.getInstance("SHA-1");
bodyDigest.update(body);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] message = "Hello world!\n".getBytes(StandardCharsets.UTF_8);
while (baos.size() < 4096) {
baos.write(message);
}
byte[] body = baos.toByteArray();

// get loopback address
HttpServer server = HttpServer.create(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0), 0);
Expand All @@ -51,7 +54,8 @@ public void fetch() throws IOException, NoSuchAlgorithmException, URISyntaxExcep
WarcWriter warcWriter = new WarcWriter(Channels.newChannel(out));
URI uri = new URI("http", null, server.getAddress().getHostString(),
server.getAddress().getPort(), "/", null, null);
FetchResult result = warcWriter.fetch(uri);
int maxLength = 512;
FetchResult result = warcWriter.fetch(uri, new FetchOptions().maxLength(maxLength));

assertEquals(256, result.response().http().status());
assertEquals("/", result.request().http().target());
Expand All @@ -61,10 +65,19 @@ public void fetch() throws IOException, NoSuchAlgorithmException, URISyntaxExcep

WarcResponse response = (WarcResponse) warcReader.next()
.orElseThrow(() -> new RuntimeException("Missing response record"));
System.out.println(new String(response.serializeHeader()));
//assertEquals(12, response.http().body().size());
assertEquals(256, response.http().status());
assertEquals("present", response.http().headers().first("Test-Header").orElse(null));
assertTrue(response.blockDigest().isPresent());
assertEquals(response.calculatedBlockDigest(), response.blockDigest());

assertEquals(WarcTruncationReason.LENGTH, response.truncated());
assertEquals(Optional.of(maxLength), response.headers().sole("Content-Length").map(Integer::parseInt));
assertEquals(maxLength, response.body().size());
MessageDigest bodyDigest = MessageDigest.getInstance("SHA-1");
long payloadSize = response.http().body().size();
bodyDigest.update(body, 0, (int) payloadSize);
assertEquals(new WarcDigest(bodyDigest).toString(), response.payloadDigest().map(Object::toString).orElse(null));

WarcRequest request = (WarcRequest) warcReader.next()
Expand Down

0 comments on commit daa1b4f

Please sign in to comment.