Skip to content

Commit

Permalink
ARC parser: Don't add MIME field to buffer
Browse files Browse the repository at this point in the history
The MIME field is ignored anyway, and in the case of a bogus MIME entry its bytes could be left in the buffer and get prepended to the length field.
  • Loading branch information
ato committed Jul 26, 2023
1 parent 0561764 commit 62dffb1
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 88 deletions.
152 changes: 72 additions & 80 deletions src/org/netpreserve/jwarc/WarcParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import static java.nio.charset.StandardCharsets.US_ASCII;


// line 147 "WarcParser.rl"
// line 142 "WarcParser.rl"


/**
Expand Down Expand Up @@ -243,30 +243,23 @@ else if ( ( (data.get(p) & 0xff)) > _warc_trans_keys[_mid+1] )
case 10:
// line 80 "WarcParser.rl"
{
// TODO
setHeader("Content-Length", new String(buf, 0, bufPos, US_ASCII));
bufPos = 0;
}
break;
case 11:
// line 85 "WarcParser.rl"
{
setHeader("Content-Length", new String(buf, 0, bufPos, US_ASCII));
bufPos = 0;
}
break;
case 12:
// line 90 "WarcParser.rl"
{
protocol = "ARC";
major = 1;
minor = 1;
}
break;
case 13:
// line 145 "WarcParser.rl"
case 12:
// line 140 "WarcParser.rl"
{ { p += 1; _goto_targ = 5; if (true) continue _goto;} }
break;
// line 270 "WarcParser.java"
// line 263 "WarcParser.java"
}
}
}
Expand All @@ -286,7 +279,7 @@ else if ( ( (data.get(p) & 0xff)) > _warc_trans_keys[_mid+1] )
break; }
}

// line 209 "WarcParser.rl"
// line 204 "WarcParser.rl"

position += p - data.position();
data.position(p);
Expand Down Expand Up @@ -340,14 +333,13 @@ private void setHeader(String name, String value) {
}


// line 344 "WarcParser.java"
// line 337 "WarcParser.java"
private static byte[] init__warc_actions_0()
{
return new byte [] {
0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1,
5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, 1,
13, 2, 0, 10, 2, 3, 0, 2, 4, 0, 2, 6,
0, 3, 11, 12, 13
5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 12, 2,
3, 0, 2, 4, 0, 2, 6, 0, 3, 10, 11, 12
};
}

Expand All @@ -362,9 +354,9 @@ private static short[] init__warc_key_offsets_0()
104, 106, 109, 111, 114, 116, 119, 121, 123, 125, 127, 129,
131, 133, 135, 137, 139, 141, 143, 145, 147, 148, 165, 167,
169, 172, 188, 205, 224, 228, 233, 236, 253, 269, 284, 302,
309, 312, 316, 334, 351, 368, 386, 403, 412, 423, 435, 439,
445, 448, 449, 452, 453, 456, 457, 460, 461, 477, 478, 494,
500, 501, 519, 525, 531, 537, 537
309, 312, 316, 334, 351, 368, 386, 403, 412, 423, 435, 441,
444, 445, 448, 449, 452, 453, 456, 457, 473, 474, 490, 496,
497, 515, 521, 527, 533, 533
};
}

Expand Down Expand Up @@ -410,15 +402,15 @@ private static char[] init__warc_trans_keys_0()
46, 48, 57, 65, 90, 94, 122, 9, 10, 32, 34, 92,
33, 126, 128, 255, 9, 34, 92, 32, 47, 48, 57, 58,
126, 128, 255, 9, 10, 34, 92, 32, 47, 48, 57, 58,
126, 128, 255, 9, 10, 32, 59, 10, 32, 0, 191, 194,
244, 32, 48, 57, 32, 46, 48, 57, 46, 46, 48, 57,
46, 46, 48, 57, 46, 13, 33, 124, 126, 35, 39, 42,
43, 45, 46, 48, 57, 65, 90, 94, 122, 10, 33, 58,
124, 126, 35, 39, 42, 43, 45, 46, 48, 57, 65, 90,
94, 122, 9, 13, 32, 127, 0, 31, 10, 9, 13, 32,
33, 124, 126, 35, 39, 42, 43, 45, 46, 48, 57, 65,
90, 94, 122, 9, 13, 32, 127, 0, 31, 9, 13, 32,
127, 0, 31, 9, 13, 32, 127, 0, 31, 0
126, 128, 255, 10, 32, 0, 191, 194, 244, 32, 48, 57,
32, 46, 48, 57, 46, 46, 48, 57, 46, 46, 48, 57,
46, 13, 33, 124, 126, 35, 39, 42, 43, 45, 46, 48,
57, 65, 90, 94, 122, 10, 33, 58, 124, 126, 35, 39,
42, 43, 45, 46, 48, 57, 65, 90, 94, 122, 9, 13,
32, 127, 0, 31, 10, 9, 13, 32, 33, 124, 126, 35,
39, 42, 43, 45, 46, 48, 57, 65, 90, 94, 122, 9,
13, 32, 127, 0, 31, 9, 13, 32, 127, 0, 31, 9,
13, 32, 127, 0, 31, 0
};
}

Expand All @@ -433,9 +425,9 @@ private static byte[] init__warc_single_lengths_0()
0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 2, 0,
1, 6, 5, 7, 4, 3, 3, 5, 4, 3, 6, 3,
3, 0, 6, 5, 5, 6, 5, 5, 3, 4, 4, 2,
1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 4,
1, 6, 4, 4, 4, 0, 0
3, 0, 6, 5, 5, 6, 5, 5, 3, 4, 2, 1,
1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 4, 1,
6, 4, 4, 4, 0, 0
};
}

Expand All @@ -450,9 +442,9 @@ private static byte[] init__warc_range_lengths_0()
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 6, 0, 1,
1, 5, 6, 6, 0, 1, 0, 6, 6, 6, 6, 2,
0, 2, 6, 6, 6, 6, 6, 2, 4, 4, 0, 2,
1, 0, 1, 0, 1, 0, 1, 0, 6, 0, 6, 1,
0, 6, 1, 1, 1, 0, 0
0, 2, 6, 6, 6, 6, 6, 2, 4, 4, 2, 1,
0, 1, 0, 1, 0, 1, 0, 6, 0, 6, 1, 0,
6, 1, 1, 1, 0, 0
};
}

Expand All @@ -468,8 +460,8 @@ private static short[] init__warc_index_offsets_0()
124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 154, 157,
159, 162, 174, 186, 200, 205, 210, 214, 226, 237, 247, 260,
266, 270, 273, 286, 298, 310, 323, 335, 343, 351, 360, 365,
370, 373, 375, 378, 380, 383, 385, 388, 390, 401, 403, 414,
420, 422, 435, 441, 447, 453, 454
368, 370, 373, 375, 378, 380, 383, 385, 396, 398, 409, 415,
417, 430, 436, 442, 448, 449
};
}

Expand All @@ -491,32 +483,32 @@ private static byte[] init__warc_indicies_0()
1, 40, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1,
46, 1, 47, 1, 48, 1, 49, 1, 50, 1, 51, 1,
52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 1, 58,
59, 59, 59, 59, 59, 59, 59, 59, 59, 57, 1, 60,
57, 61, 1, 62, 61, 1, 1, 58, 59, 63, 59, 59,
59, 59, 59, 59, 59, 57, 1, 60, 64, 64, 64, 64,
64, 64, 64, 64, 64, 57, 65, 1, 66, 64, 67, 64,
64, 64, 64, 64, 64, 64, 64, 57, 65, 1, 68, 67,
57, 69, 69, 70, 61, 1, 69, 69, 70, 1, 70, 70,
71, 71, 71, 71, 71, 71, 71, 71, 71, 1, 71, 72,
71, 71, 71, 71, 71, 71, 71, 71, 1, 74, 73, 73,
73, 73, 73, 73, 73, 73, 1, 69, 66, 73, 70, 73,
73, 73, 73, 73, 73, 73, 73, 1, 74, 75, 76, 74,
74, 1, 69, 66, 70, 1, 74, 74, 1, 67, 1, 77,
78, 78, 78, 78, 78, 78, 78, 78, 78, 57, 70, 70,
71, 71, 71, 71, 71, 71, 79, 71, 71, 1, 62, 71,
72, 71, 71, 71, 71, 71, 79, 71, 71, 1, 1, 60,
78, 80, 78, 78, 78, 78, 78, 78, 78, 78, 57, 1,
60, 81, 64, 64, 64, 64, 64, 64, 64, 64, 57, 81,
1, 82, 83, 84, 81, 81, 57, 74, 75, 76, 74, 85,
74, 74, 1, 74, 62, 75, 76, 74, 85, 74, 74, 1,
65, 1, 66, 67, 57, 74, 82, 81, 81, 57, 40, 86,
1, 40, 1, 37, 87, 1, 37, 1, 34, 88, 1, 34,
1, 31, 89, 1, 31, 1, 90, 91, 91, 91, 91, 91,
91, 91, 91, 91, 1, 92, 1, 91, 93, 91, 91, 91,
91, 91, 91, 91, 91, 1, 94, 95, 94, 1, 1, 96,
97, 1, 98, 99, 98, 100, 100, 100, 100, 100, 100, 100,
100, 100, 1, 98, 101, 98, 1, 1, 102, 103, 104, 103,
1, 1, 96, 105, 95, 105, 1, 1, 96, 1, 1, 0
59, 59, 59, 59, 59, 59, 59, 59, 59, 57, 1, 58,
57, 60, 1, 61, 60, 1, 1, 58, 59, 62, 59, 59,
59, 59, 59, 59, 59, 57, 1, 58, 63, 63, 63, 63,
63, 63, 63, 63, 63, 57, 64, 1, 65, 63, 66, 63,
63, 63, 63, 63, 63, 63, 63, 57, 64, 1, 65, 66,
57, 67, 67, 68, 60, 1, 67, 67, 68, 1, 68, 68,
69, 69, 69, 69, 69, 69, 69, 69, 69, 1, 69, 70,
69, 69, 69, 69, 69, 69, 69, 69, 1, 72, 71, 71,
71, 71, 71, 71, 71, 71, 1, 67, 65, 71, 68, 71,
71, 71, 71, 71, 71, 71, 71, 1, 72, 73, 74, 72,
72, 1, 67, 65, 68, 1, 72, 72, 1, 66, 1, 75,
76, 76, 76, 76, 76, 76, 76, 76, 76, 57, 68, 68,
69, 69, 69, 69, 69, 69, 77, 69, 69, 1, 61, 69,
70, 69, 69, 69, 69, 69, 77, 69, 69, 1, 1, 58,
76, 78, 76, 76, 76, 76, 76, 76, 76, 76, 57, 1,
58, 79, 63, 63, 63, 63, 63, 63, 63, 63, 57, 79,
1, 80, 64, 81, 79, 79, 57, 72, 73, 74, 72, 82,
72, 72, 1, 72, 61, 73, 74, 72, 82, 72, 72, 1,
72, 80, 79, 79, 57, 40, 83, 1, 40, 1, 37, 84,
1, 37, 1, 34, 85, 1, 34, 1, 31, 86, 1, 31,
1, 87, 88, 88, 88, 88, 88, 88, 88, 88, 88, 1,
89, 1, 88, 90, 88, 88, 88, 88, 88, 88, 88, 88,
1, 91, 92, 91, 1, 1, 93, 94, 1, 95, 96, 95,
97, 97, 97, 97, 97, 97, 97, 97, 97, 1, 95, 98,
95, 1, 1, 99, 100, 101, 100, 1, 1, 93, 102, 92,
102, 1, 1, 93, 1, 1, 0
};
}

Expand All @@ -527,14 +519,14 @@ private static byte[] init__warc_trans_targs_0()
{
return new byte [] {
2, 0, 20, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 89, 14, 14, 15, 18, 16, 17, 12, 13, 15,
18, 19, 15, 19, 21, 22, 23, 24, 78, 25, 26, 76,
27, 28, 74, 29, 30, 72, 31, 32, 33, 34, 35, 36,
12, 13, 88, 14, 14, 15, 18, 16, 17, 12, 13, 15,
18, 19, 15, 19, 21, 22, 23, 24, 77, 25, 26, 75,
27, 28, 73, 29, 30, 71, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49,
47, 48, 89, 50, 51, 52, 53, 62, 53, 54, 55, 56,
57, 58, 59, 60, 61, 63, 65, 64, 66, 67, 68, 70,
71, 69, 73, 75, 77, 79, 81, 82, 90, 83, 83, 84,
87, 85, 86, 81, 82, 84, 87, 88, 84, 88
48, 88, 50, 51, 52, 53, 62, 54, 55, 56, 57, 58,
59, 60, 61, 63, 65, 64, 66, 67, 68, 70, 69, 72,
74, 76, 78, 80, 81, 89, 82, 82, 83, 86, 84, 85,
80, 81, 83, 86, 87, 83, 87
};
}

Expand All @@ -545,27 +537,27 @@ private static byte[] init__warc_trans_actions_0()
{
return new byte [] {
0, 0, 1, 0, 0, 0, 0, 3, 0, 5, 0, 0,
0, 1, 23, 11, 0, 0, 1, 0, 0, 13, 34, 9,
31, 28, 7, 1, 1, 15, 1, 1, 1, 1, 1, 1,
0, 1, 21, 11, 0, 0, 1, 0, 0, 13, 29, 9,
26, 23, 7, 1, 1, 15, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 17, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 19, 0, 21, 1,
0, 1, 37, 1, 1, 1, 25, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 1, 0, 11, 0, 0,
1, 0, 0, 13, 34, 9, 31, 28, 7, 1
1, 1, 1, 1, 1, 1, 1, 1, 19, 0, 0, 0,
1, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
1, 1, 1, 0, 1, 0, 11, 0, 0, 1, 0, 0,
13, 29, 9, 26, 23, 7, 1
};
}

private static final byte _warc_trans_actions[] = init__warc_trans_actions_0();


static final int warc_start = 1;
static final int warc_first_final = 89;
static final int warc_first_final = 88;
static final int warc_error = 0;

static final int warc_en_warc_fields = 80;
static final int warc_en_warc_fields = 79;
static final int warc_en_any_header = 1;


// line 262 "WarcParser.rl"
// line 257 "WarcParser.rl"
}
7 changes: 1 addition & 6 deletions src/org/netpreserve/jwarc/WarcParser.rl
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,6 @@ action handle_arc_date {
bufPos = 0;
}

action handle_arc_mime {
// TODO
bufPos = 0;
}

action handle_arc_length {
setHeader("Content-Length", new String(buf, 0, bufPos, US_ASCII));
bufPos = 0;
Expand Down Expand Up @@ -136,7 +131,7 @@ arc_url_byte = any - "\n" - " ";
arc_url = (lower+ ":" arc_url_byte*) $push %handle_arc_url;
arc_ip = (digit{1,3} "." digit{1,3} "." digit{1,3} "." digit{1,3}) $push %handle_arc_ip;
arc_date = digit{14} $push %handle_arc_date;
arc_mime = (token ("/" token ( OWS ";" OWS parameter )*)?)? $push %handle_arc_mime;
arc_mime = (token ("/" token ( OWS ";" OWS parameter )*)?)?;
arc_mime_lenient = arc_mime | (any - " " - "\n")*;
arc_length = digit+ $push %handle_arc_length %handle_arc;
arc_header = arc_url " " arc_ip " " arc_date " " arc_mime_lenient " " arc_length "\n";
Expand Down
14 changes: 12 additions & 2 deletions test/org/netpreserve/jwarc/WarcParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,21 @@
public class WarcParserTest {
@Test
public void testParsingArcWithBogusMime() {
String input = "http://example.com/ 1.2.3.4 20110104111607 @[=*�Content-Type] 494\n";
WarcParser parser = parse("http://example.com/ 1.2.3.4 20110104111607 @[=*�Content-Type] 494\n");
assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
parser = parse("http://example.com/ 1.2.3.4 20110104111607 charset=foo 494\n");
assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
parser = parse("http://example.com/ 1.2.3.4 20110104111607 image(jpeg) 494\n");
assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
parser = parse("http://example.com/ 1.2.3.4 20110104111607 ERROR: 494\n");
assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
}

private static WarcParser parse(String input) {
WarcParser parser = new WarcParser();
parser.parse(ByteBuffer.wrap(input.getBytes(StandardCharsets.ISO_8859_1)));
assertFalse(parser.isError());
assertTrue(parser.isFinished());
assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
return parser;
}
}

0 comments on commit 62dffb1

Please sign in to comment.