<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Do not modify this file directly. Instead, copy entries that you -->
<!-- wish to modify from this file into nutch-site.xml and change them -->
<!-- there. If nutch-site.xml does not already exist, create it. -->
<configuration>
<!-- general properties -->
<property>
<name>store.ip.address</name>
<value>false</value>
<description>Enables capturing the specific IP address
(InetSocketAddress) of the host which we connect to via
the given protocol. Currently supported by protocol-ftp and
protocol-http.
</description>
</property>
<!-- file properties -->
<property>
<name>file.content.limit</name>
<value>65536</value>
<description>The length limit for downloaded content using the file://
protocol, in bytes. If this value is nonnegative (>=0), content longer
than it will be truncated; otherwise, no truncation at all. Do not
confuse this setting with the http.content.limit setting.
</description>
</property>
<property>
<name>file.crawl.parent</name>
<value>false</value>
<description>By default the crawler is not restricted to the directories specified in the
seed URLs file; it also ascends into parent directories. For your own crawls you can
change this behavior (set this property to false) so that only directories beneath the
directories you specify get crawled.</description>
</property>
<property>
<name>file.crawl.redirect_noncanonical</name>
<value>false</value>
<description>
If true, protocol-file treats non-canonical file names as
redirects and does not canonicalize file names internally. A file
name containing symbolic links as path elements is then not
resolved and "fetched" but recorded as a redirect, with the
canonical name (all symbolic links on the path resolved) as the
redirect target.
</description>
</property>
<property>
<name>file.content.ignored</name>
<value>true</value>
<description>If true, no file content will be saved during fetch.
This is probably what we want most of the time, since file:// URLs
are meant to be local and we can always use them directly at the parsing
and indexing stages. Otherwise file contents will be saved.
!! NOT IMPLEMENTED YET !!
</description>
</property>
<!-- HTTP properties -->
<property>
<name>http.agent.name</name>
<value>Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML\, like Gecko) Chrome/3.0.198</value>
<description>HTTP 'User-Agent' request header. MUST NOT be empty -
please set this to a single word uniquely related to your organization.
NOTE: You should also check other related properties:
http.robots.agents
http.agent.description
http.agent.url
http.agent.email
http.agent.version
and set their values appropriately.
</description>
</property>
<property>
<name>http.robots.agents</name>
<value></value>
<description>Any other agents, apart from 'http.agent.name', that the robots
parser would look for in robots.txt. Multiple agents can be provided using
comma as a delimiter. eg. mybot,foo-spider,bar-crawler
The ordering of agents does NOT matter and the robots parser makes its
decision based on the first agent that matches the robots rules.
Also, there is NO need to add a wildcard (i.e. "*") to this string as the
robots parser smartly takes care of a no-match situation.
If no value is specified, the HTTP agent (i.e. 'http.agent.name')
is used for user agent matching by the robots parser.
</description>
</property>
<property>
<name>http.robot.rules.whitelist</name>
<value></value>
<description>Comma separated list of hostnames or IP addresses to ignore
robot rules parsing for. Use with care and only if you are explicitly
allowed by the site owner to ignore the site's robots.txt!
</description>
</property>
<property>
<name>ignore.robots.txt</name>
<value>true</value>
<description>If true, the crawler ignores any robots.txt file on the server</description>
</property>
<property>
<name>http.robots.403.allow</name>
<value>true</value>
<description>Some servers return HTTP status 403 (Forbidden) if
/robots.txt doesn't exist. This should probably mean that we are
allowed to crawl the site nonetheless. If this is set to false,
then such sites will be treated as forbidden.</description>
</property>
<property>
<name>http.agent.description</name>
<value></value>
<description>Further description of our bot - this text is used in
the User-Agent header. It appears in parentheses after the agent name.
</description>
</property>
<property>
<name>http.agent.url</name>
<value></value>
<description>A URL to advertise in the User-Agent header. This will
appear in parentheses after the agent name. Custom dictates that this
should be a URL of a page explaining the purpose and behavior of this
crawler.
</description>
</property>
<property>
<name>http.agent.email</name>
<value></value>
<description>An email address to advertise in the HTTP 'From' request
header and User-Agent header. A good practice is to mangle this
address (e.g. 'info at example dot com') to avoid spamming.
</description>
</property>
<property>
<name>http.agent.version</name>
<value></value>
<description>A version string to advertise in the User-Agent
header.</description>
</property>
<property>
<name>http.agent.rotate</name>
<value>true</value>
<description>
If true, instead of http.agent.name, alternating agent names are
chosen from a list provided via http.agent.rotate.file.
</description>
</property>
<property>
<name>http.agent.rotate.file</name>
<value>agents.txt</value>
<description>
File containing alternative user agent names to be used instead of
http.agent.name on a rotating basis if http.agent.rotate is true.
Each line of the file should contain exactly one agent
specification including name, version, description, URL, etc.
</description>
</property>
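<!-- Illustration only (not an active setting): the description above says each
     line of the rotation file holds one complete agent specification. A
     hypothetical agents.txt might therefore look like:
       MyCrawler/1.0 (test crawler; https://example.org/bot; bot at example dot org)
       OtherBot/2.1 (+https://example.org/otherbot)
     The names and URLs are placeholders, not values shipped with Nutch. -->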
<property>
<name>http.agent.host</name>
<value></value>
<description>Name or IP address of the host on which the Nutch crawler
would be running. Currently this is used by 'protocol-httpclient'
plugin.
</description>
</property>
<property>
<name>http.timeout</name>
<value>60000</value>
<description>The default network timeout, in milliseconds.</description>
</property>
<property>
<name>http.max.delays</name>
<value>100</value>
<description>The number of times a thread will delay when trying to
fetch a page. Each time it finds that a host is busy, it will wait
fetcher.server.delay. After http.max.delays attempts, it will give
up on the page for now.</description>
</property>
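<!-- Worked example (derived from the values in this file, assuming they are not
     overridden elsewhere): with http.max.delays = 100 and
     fetcher.server.delay = 5.0 seconds, a fetcher thread may wait roughly
     100 * 5 s = 500 s for a busy host before giving up on the page for now. -->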
<property>
<name>http.content.limit</name>
<value>65536</value>
<description>The length limit for downloaded content using the http://
protocol, in bytes. If this value is nonnegative (>=0), content longer
than it will be truncated; otherwise, no truncation at all. Do not
confuse this setting with the file.content.limit setting.
</description>
</property>
<property>
<name>http.time.limit</name>
<value>-1</value>
<description>The time limit in seconds to fetch a single document.
If this value is nonnegative (>=0), the HTTP protocol implementation
will stop reading from a socket after http.time.limit seconds have
been spent for fetching this document. The HTTP response is then
marked as truncated. The http.time.limit should be set to a longer
time period than http.timeout, as it applies to the entire duration
to fetch a document, not only the network timeout of a single I/O
operation. Note: supported only by protocol-okhttp.
</description>
</property>
<property>
<name>http.partial.truncated</name>
<value>false</value>
<description>
If true the HTTP protocol implementation may store the content of
partial fetches and mark the response as truncated instead of
throwing an exception which will cause the fetch to fail. This
allows using the data which has already been fetched, instead of
retrying the fetch later. Note: supported only by protocol-okhttp.
</description>
</property>
<property>
<name>http.proxy.host</name>
<value></value>
<description>The proxy hostname. If empty, no proxy is used.</description>
</property>
<property>
<name>http.proxy.port</name>
<value>8118</value>
<description>The proxy port.</description>
</property>
<property>
<name>http.proxy.username</name>
<value></value>
<description>Username for proxy. This will be used by
'protocol-httpclient', if the proxy server requests basic, digest
and/or NTLM authentication. To use this, 'protocol-httpclient' must
be present in the value of 'plugin.includes' property.
NOTE: For NTLM authentication, do not prefix the username with the
domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
</description>
</property>
<property>
<name>http.proxy.password</name>
<value></value>
<description>Password for proxy. This will be used by
'protocol-httpclient', if the proxy server requests basic, digest
and/or NTLM authentication. To use this, 'protocol-httpclient' must
be present in the value of 'plugin.includes' property.
</description>
</property>
<property>
<name>http.proxy.realm</name>
<value></value>
<description>Authentication realm for proxy. Do not define a value
if realm is not required or authentication should take place for any
realm. NTLM does not use the notion of realms. Specify the domain name
of NTLM authentication as the value for this property. To use this,
'protocol-httpclient' must be present in the value of
'plugin.includes' property.
</description>
</property>
<property>
<name>http.auth.file</name>
<value>httpclient-auth.xml</value>
<description>Authentication configuration file for
'protocol-httpclient' plugin.
</description>
</property>
<property>
<name>http.proxy.type</name>
<value>HTTP</value>
<description>
Proxy type: HTTP or SOCKS (cf. java.net.Proxy.Type).
Note: supported by protocol-okhttp.
</description>
</property>
<property>
<name>http.proxy.exception.list</name>
<value></value>
<description>A comma separated list of hosts that don't use the proxy
(e.g. intranets). Example: www.apache.org</description>
</property>
<property>
<name>http.verbose</name>
<value>true</value>
<description>If true, HTTP will log more verbosely.</description>
</property>
<property>
<name>http.redirect.max</name>
<value>3</value>
<description>The maximum number of redirects the fetcher will follow when
trying to fetch a page. If set to negative or 0, the fetcher won't immediately
follow redirected URLs; instead it will record them for later fetching.
</description>
</property>
<property>
<name>http.useHttp11</name>
<value>true</value>
<description>
If true, use HTTP/1.1; if false, use HTTP/1.0.
</description>
</property>
<property>
<name>http.useHttp2</name>
<value>false</value>
<description>
If true, try HTTP/2 and fall back to HTTP/1.1 if HTTP/2 is not
supported; if false, always use HTTP/1.1.
NOTE: HTTP/2 is currently only supported by protocol-okhttp and
requires at runtime Java 9 or a modified Java 8 with support for
ALPN (Application Layer Protocol Negotiation).
</description>
</property>
<property>
<name>http.accept.language</name>
<value>ru;q=0.7,*;q=0.3</value>
<description>Value of the "Accept-Language" request header field.
This allows selecting non-English language as default one to retrieve.
It is a useful setting for search engines build for certain national group.
To send requests without "Accept-Language" header field, thi property must
be configured to contain a space character because an empty property does
not overwrite the default.
</description>
</property>
<property>
<name>http.accept</name>
<value>text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
<description>Value of the "Accept" request header field. A space character
as value will cause that no "Accept" header field is sent in the request.
</description>
</property>
<property>
<name>http.accept.charset</name>
<value>utf-8,iso-8859-1;q=0.7,*;q=0.7</value>
<description>Value of the "Accept-Charset" request header field. A space character
as value will cause that no "Accept-Charset" header field is sent in the request.
</description>
</property>
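<!-- Illustration only: per the descriptions of the three Accept* properties
     above, an empty value keeps the built-in default header, while a value
     consisting of a single space suppresses the header entirely. A hypothetical
     override such as
       <property>
         <name>http.accept.charset</name>
         <value> </value>
       </property>
     would therefore cause requests to be sent without an Accept-Charset
     header field. -->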
<property>
<name>http.store.responsetime</name>
<value>true</value>
<description>Enables recording the response time of the
host, i.e. the time period between opening and closing the
connection to the page's host. The response time in milliseconds
is stored in the CrawlDb in the CrawlDatum's metadata under the key "_rs_".
</description>
</property>
<property>
<name>http.enable.if.modified.since.header</name>
<value>true</value>
<description>Whether Nutch sends an HTTP If-Modified-Since header. It reduces
bandwidth when enabled by not downloading pages that respond with an HTTP
Not-Modified header. URLs that are not downloaded are not passed through
parse or indexing filters. If you regularly modify filters, you should force
Nutch to also download unmodified pages by disabling this feature.
</description>
</property>
<property>
<name>http.enable.cookie.header</name>
<value>true</value>
<description>Whether Nutch sends an HTTP Cookie header. The cookie value
is read from the CrawlDatum Cookie metadata field.
</description>
</property>
<!-- FTP properties -->
<property>
<name>ftp.username</name>
<value>anonymous</value>
<description>ftp login username.</description>
</property>
<property>
<name>ftp.password</name>
<value>anonymous@example.com</value>
<description>ftp login password.</description>
</property>
<property>
<name>ftp.content.limit</name>
<value>65536</value>
<description>The length limit for downloaded content, in bytes.
If this value is nonnegative (>=0), content longer than it will be truncated;
otherwise, no truncation at all.
Caution: classical FTP RFCs never define partial transfer and, in fact,
some FTP servers out there do not handle client-side forced close-down very
well. Our implementation tries its best to handle such situations smoothly.
</description>
</property>
<property>
<name>ftp.timeout</name>
<value>60000</value>
<description>Default timeout for ftp client socket, in millisec.
Please also see ftp.keep.connection below.</description>
</property>
<property>
<name>ftp.server.timeout</name>
<value>100000</value>
<description>An estimation of ftp server idle time, in millisec.
Typically it is 120000 millisec for many ftp servers out there.
Better be conservative here. Together with ftp.timeout, it is used to
decide if we need to delete (annihilate) current ftp.client instance and
force to start another ftp.client instance anew. This is necessary because
a fetcher thread may not be able to obtain next request from queue in time
(due to idleness) before our ftp client times out or remote server
disconnects. Used only when ftp.keep.connection is true (please see below).
</description>
</property>
<property>
<name>ftp.keep.connection</name>
<value>false</value>
<description>Whether to keep ftp connection. Useful if crawling same host
again and again. When set to true, it avoids connection, login and dir list
parser setup for subsequent urls. If it is set to true, however, you must
make sure (roughly):
(1) ftp.timeout is less than ftp.server.timeout
(2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
Otherwise there will be too many "delete client because idled too long"
messages in thread logs.</description>
</property>
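<!-- Quick check (only relevant if ftp.keep.connection were set to true; it is
     false here): the description asks for ftp.timeout < ftp.server.timeout and
     ftp.timeout > fetcher.threads.fetch * fetcher.server.delay. With the values
     in this file: 60000 ms < 100000 ms holds, but 20 threads * 5.0 s = 100 s =
     100000 ms, which is larger than ftp.timeout = 60000 ms, so condition (2)
     would be violated and the timeouts would need adjusting first. -->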
<property>
<name>ftp.follow.talk</name>
<value>false</value>
<description>Whether to log dialogue between our client and remote
server. Useful for debugging.</description>
</property>
<!-- web db properties -->
<property>
<name>db.fetch.interval.default</name>
<value>604800</value>
<description>The default number of seconds between re-fetches of a page (7 days).
</description>
</property>
<property>
<name>db.fetch.interval.max</name>
<value>7776000</value>
<description>The maximum number of seconds between re-fetches of a page
(90 days). After this period every page in the db will be re-tried, no
matter what its status is.
</description>
</property>
<property>
<name>db.fetch.schedule.class</name>
<value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
<description>The implementation of fetch schedule. DefaultFetchSchedule simply
adds the original fetchInterval to the last fetch time, regardless of
page changes, whereas AdaptiveFetchSchedule (see below) tries to adapt
to the rate at which a given page is changed.
</description>
</property>
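<!-- Illustration only (not an active setting): to use the adaptive schedule
     mentioned above instead of the default, one would override the class, e.g.
       <property>
         <name>db.fetch.schedule.class</name>
         <value>org.apache.nutch.crawl.AdaptiveFetchSchedule</value>
       </property>
     The db.fetch.schedule.adaptive.* properties below only take effect with an
     adaptive schedule class. -->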
<property>
<name>db.fetch.schedule.adaptive.inc_rate</name>
<value>0.4</value>
<description>If a page is unmodified, its fetchInterval will be
increased by this rate. This value should not
exceed 0.5, otherwise the algorithm becomes unstable.</description>
</property>
<property>
<name>db.fetch.schedule.adaptive.dec_rate</name>
<value>0.2</value>
<description>If a page is modified, its fetchInterval will be
decreased by this rate. This value should not
exceed 0.5, otherwise the algorithm becomes unstable.</description>
</property>
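<!-- Worked example (a sketch of the behavior as described above, assuming the
     interval is scaled by these rates): starting from the 604800 s (7 day)
     default interval, an unmodified page would move to roughly
     604800 * (1 + 0.4) = 846720 s (about 9.8 days), while a modified page would
     move to roughly 604800 * (1 - 0.2) = 483840 s (about 5.6 days). -->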
<property>
<name>db.fetch.schedule.adaptive.min_interval</name>
<value>60.0</value>
<description>Minimum fetchInterval, in seconds.</description>
</property>
<property>
<name>db.fetch.schedule.adaptive.max_interval</name>
<value>31536000.0</value>
<description>Maximum fetchInterval, in seconds (365 days).
NOTE: this is limited by db.fetch.interval.max. Pages with
fetchInterval larger than db.fetch.interval.max
will be fetched anyway.</description>
</property>
<property>
<name>db.fetch.schedule.adaptive.sync_delta</name>
<value>true</value>
<description>If true, try to synchronize with the time of page change
by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference
between the last modification time and the last fetch time.</description>
</property>
<property>
<name>db.fetch.schedule.adaptive.sync_delta_rate</name>
<value>0.3</value>
<description>See sync_delta for description. This value should not
exceed 0.5, otherwise the algorithm becomes unstable.</description>
</property>
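<!-- Worked example (an illustration of the sync_delta rule stated above, not an
     exact reproduction of the scheduler code): if a page was fetched 10 days
     after its last recorded modification, the next fetchTime would be shifted
     by roughly 0.3 * 10 days = 3 days towards the observed change time. -->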
<property>
<name>db.fetch.schedule.mime.file</name>
<value>adaptive-mimetypes.txt</value>
<description>The configuration file for the MimeAdaptiveFetchSchedule.
</description>
</property>
<property>
<name>db.update.additions.allowed</name>
<value>true</value>
<description>If true, updatedb will add newly discovered URLs, if false
only already existing URLs in the CrawlDb will be updated and no new
URLs will be added.
</description>
</property>
<property>
<name>db.preserve.backup</name>
<value>true</value>
<description>If true, updatedb will keep a backup of the previous CrawlDB
version in the old directory. In case of disaster, one can rename old to
current and restore the CrawlDB to its previous state.
</description>
</property>
<property>
<name>db.update.purge.404</name>
<value>true</value>
<description>If true, updatedb will purge records with status DB_GONE
from the CrawlDB.
</description>
</property>
<property>
<name>db.update.purge.orphans</name>
<value>false</value>
<description>If true, updatedb will permanently delete URLs marked
as orphans from the CrawlDb. The plugin scoring-orphan needs to be
activated to get records marked as orphan. See the plugin's options
elsewhere in this document.
</description>
</property>
<property>
<name>crawldb.url.normalizers</name>
<value>false</value>
<description>
!Temporary, can be overridden on the command line!
Normalize URLs when updating the CrawlDb.
</description>
</property>
<property>
<name>crawldb.url.filters</name>
<value>false</value>
<description>
!Temporary, can be overridden on the command line!
Filter URLs when updating the CrawlDb.
</description>
</property>
<property>
<name>db.update.max.inlinks</name>
<value>10000</value>
<description>Maximum number of inlinks to take into account when updating
a URL score in the crawlDB. Only the best scoring inlinks are kept.
</description>
</property>
<property>
<name>db.ignore.internal.links</name>
<value>true</value>
<description>If true, outlinks leading from a page to internal hosts or domains
will be ignored. This is an effective way to limit the crawl to include
only initially injected hosts or domains, without creating complex URLFilters.
See 'db.ignore.external.links.mode'.
</description>
</property>
<property>
<name>db.ignore.external.links</name>
<value>true</value>
<description>If true, outlinks leading from a page to external hosts or domains
will be ignored. This is an effective way to limit the crawl to include
only initially injected hosts or domains, without creating complex URLFilters.
See 'db.ignore.external.links.mode'.
</description>
</property>
<property>
<name>db.ignore.also.redirects</name>
<value>false</value>
<description>If true, the fetcher checks redirects the same way as
links when ignoring internal or external links. Set to false to
follow redirects despite the values for db.ignore.external.links and
db.ignore.internal.links.
</description>
</property>
<property>
<name>db.ignore.external.links.mode</name>
<value>byHost</value>
<description>Determines whether links are classified as internal or external
by host ('byHost', the default) or by domain ('byDomain').</description>
</property>
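<!-- Illustration only (hypothetical host names): with the 'byHost' mode set
     here, a link from news.example.com to blog.example.com counts as external
     (different hosts); with 'byDomain' it would count as internal, since both
     hosts share the domain example.com. -->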
<property>
<name>db.ignore.external.exemptions.file</name>
<value>db-ignore-external-exemptions.txt</value>
<description>
This file contains exemption rules used by the 'urlfilter-ignoreexempt' plugin.
</description>
</property>
<property>
<name>db.injector.overwrite</name>
<value>false</value>
<description>Whether existing records in the CrawlDB will be overwritten
by injected records.
</description>
</property>
<property>
<name>db.injector.update</name>
<value>false</value>
<description>If true, existing records in the CrawlDB will be updated with
injected records. Old meta data is preserved. The db.injector.overwrite
parameter has precedence.
</description>
</property>
<property>
<name>db.score.injected</name>
<value>1.0</value>
<description>The score of new pages added by the injector.
</description>
</property>
<property>
<name>db.score.link.external</name>
<value>1.0</value>
<description>The score factor for new pages added due to a link from
another host relative to the referencing page's score. Scoring plugins
may use this value to affect initial scores of external links.
</description>
</property>
<property>
<name>db.score.link.internal</name>
<value>1.0</value>
<description>The score factor for pages added due to a link from the
same host, relative to the referencing page's score. Scoring plugins
may use this value to affect initial scores of internal links.
</description>
</property>
<property>
<name>db.score.count.filtered</name>
<value>false</value>
<description>The score value passed to newly discovered pages is
calculated as a fraction of the original page score divided by the
number of outlinks. If this option is false, only the outlinks that passed
URLFilters will count; if it's true, then all outlinks will count.
</description>
</property>
<property>
<name>db.max.outlinks.per.page</name>
<value>5</value>
<description>The maximum number of outlinks that we'll process for a page.
If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
will be processed for a page; otherwise, all outlinks will be processed.
</description>
</property>
<property>
<name>db.max.outlink.length</name>
<value>4096</value>
<description>
The maximum length in characters accepted for outlinks before
applying URL normalizers and filters. If this value is
nonnegative (>=0), only URLs with a length in characters less than or
equal to db.max.outlink.length are accepted and then passed to
URL normalizers and filters. Doing the length check beforehand
prevents normalizers or filters from hanging on overlong URLs.
Note: this property is only used to check URLs found as outlinks
and redirects, but not for injected URLs.
</description>
</property>
<property>
<name>db.parsemeta.to.crawldb</name>
<value></value>
<description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779).
Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang'
will copy both the key 'lang' and its value to the corresponding entry in the crawldb.
</description>
</property>
<property>
<name>db.fetch.retry.max</name>
<value>3</value>
<description>The maximum number of times a url that has encountered
recoverable errors is generated for fetch.</description>
</property>
<property>
<name>db.signature.class</name>
<value>org.apache.nutch.crawl.MD5Signature</value>
<description>The default implementation of a page signature. Signatures
created with this implementation will be used for duplicate detection
and removal.</description>
</property>
<property>
<name>db.signature.text_profile.min_token_len</name>
<value>2</value>
<description>Minimum token length to be included in the signature.
</description>
</property>
<property>
<name>db.signature.text_profile.quant_rate</name>
<value>0.01</value>
<description>Profile frequencies will be rounded down to a multiple of
QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
for longer texts tokens with frequency 1 will always be discarded.
</description>
</property>
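<!-- Worked example (arithmetic from the formula above): with quant_rate = 0.01
     and a document whose most frequent token occurs maxFreq = 350 times,
     QUANT = (int)(0.01 * 350) = 3, so profile frequencies are rounded down to
     multiples of 3 and tokens occurring only once or twice drop out of the
     signature. -->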
<property>
<name>db.stats.score.quantiles</name>
<value>.01,.05,.1,.2,.25,.3,.4,.5,.6,.7,.75,.8,.9,.95,.99</value>
<description>
Quantiles of the distribution of CrawlDatum scores shown in the
CrawlDb statistics (command `readdb -stats'). Comma-separated
list of floating point numbers.
</description>
</property>
<!-- linkdb properties -->
<property>
<name>linkdb.max.inlinks</name>
<value>1</value>
<description>Maximum number of Inlinks per URL to be kept in LinkDb.
If "invertlinks" finds more inlinks than this number, only the first
N inlinks will be stored, and the rest will be discarded.
</description>
</property>
<property>
<name>linkdb.ignore.internal.links</name>
<value>true</value>
<description>If true, when adding new links to a page, links from
the same host are ignored. This is an effective way to limit the
size of the link database, keeping only the highest quality
links.
</description>
</property>
<property>
<name>linkdb.ignore.external.links</name>
<value>true</value>
<description>If true, when adding new links to a page, links from
a different host are ignored.
</description>
</property>
<property>
<name>linkdb.max.anchor.length</name>
<value>100</value>
<description>
The maximum number of characters permitted for anchor texts stored
in LinkDb.
</description>
</property>
<!-- generate properties -->
<property>
<name>generate.max.count</name>
<value>-1</value>
<description>The maximum number of URLs in a single
fetchlist. -1 if unlimited. The URLs are counted according
to the value of the parameter generate.count.mode.
</description>
</property>
<property>
<name>generate.count.mode</name>
<value>host</value>
<description>Determines how the URLs are counted for generate.max.count.
Default value is 'host' but can be 'domain'. Note that we do not count
per IP in the new version of the Generator.
</description>
</property>
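<!-- Illustration only (hypothetical values, not the ones set above): with
     generate.max.count = 100 and generate.count.mode = host, the generator
     would put at most 100 URLs per host into a single fetchlist; with the mode
     set to 'domain', the cap of 100 would apply per domain instead. The value
     -1 configured above disables the cap entirely. -->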
<property>
<name>generate.update.crawldb</name>
<value>true</value>
<description>For highly-concurrent environments, where several
generate/fetch/update cycles may overlap, setting this to true ensures
that generate will create different fetchlists even without intervening
updatedb-s, at the cost of running an additional job to update CrawlDB.
If false, running generate twice without intervening
updatedb will generate identical fetchlists.</description>
</property>
<property>
<name>generate.min.score</name>
<value>0</value>
<description>Select only entries with a score larger than
generate.min.score.</description>
</property>
<property>
<name>generate.min.interval</name>
<value>-1</value>
<description>Select only entries with a retry interval lower than
generate.min.interval. A value of -1 disables this check.</description>
</property>
<property>
<name>generate.hostdb</name>
<value></value>
<description>Path to HostDB, required for the generate.max.count.expr
and generate.fetch.delay.expr properties.
See https://issues.apache.org/jira/browse/NUTCH-2368</description>
</property>
<property>
<name>generate.fetch.delay.expr</name>
<value></value>
<description>Controls variable fetcher.server.delay via a Jexl expression and
HostDB information. It allows you to alter fetch delay based on HostDB data.
See https://issues.apache.org/jira/browse/NUTCH-2368</description>
</property>
<property>
<name>generate.max.count.expr</name>
<value></value>
<description>Controls variable generate.max.count via a Jexl expression and
HostDB information. It allows you to alter maxCount based on HostDB data.
See https://issues.apache.org/jira/browse/NUTCH-2368</description>
</property>
<!-- urlpartitioner properties -->
<property>
<name>partition.url.mode</name>
<value>byHost</value>
<description>Determines how to partition URLs. Default value is 'byHost',
also takes 'byDomain' or 'byIP'.
</description>
</property>
<property>
<name>crawl.gen.delay</name>
<value>604800000</value>
<description>
This value, expressed in milliseconds, defines how long we should keep the lock on records
in CrawlDb that were just selected for fetching. If these records are not updated
in the meantime, the lock is canceled, i.e. they become eligible for selection again.
The default value is 7 days (604800000 ms).
</description>
</property>
<!-- fetcher properties -->
<property>
<name>fetcher.server.delay</name>
<value>5.0</value>
<description>The number of seconds the fetcher will delay between
successive requests to the same server. Note that this might get
overridden by a Crawl-Delay from a robots.txt and is used ONLY if
fetcher.threads.per.queue is set to 1.
</description>
</property>
<property>
<name>fetcher.server.min.delay</name>
<value>5.0</value>
<description>The minimum number of seconds the fetcher will delay between
successive requests to the same server. This value is applicable ONLY
if fetcher.threads.per.queue is greater than 1 (i.e. the host blocking
is turned off).</description>
</property>
<property>
<name>fetcher.max.crawl.delay</name>
<value>30</value>
<description>
If the Crawl-Delay in robots.txt is set to greater than this value (in
seconds) then the fetcher will skip this page, generating an error report.
If set to -1 the fetcher will never skip such pages and will wait the
amount of time retrieved from robots.txt Crawl-Delay, however long that
might be.
</description>
</property>
<property>
<name>fetcher.threads.fetch</name>
<value>20</value>
<description>The number of FetcherThreads the fetcher should use.
This also determines the maximum number of requests that are
made at once (each FetcherThread handles one connection). The total
number of threads running in distributed mode will be the number of
fetcher threads * number of nodes as fetcher has one map task per node.
</description>
</property>
<property>
<name>fetcher.threads.per.queue</name>
<value>20</value>
<description>This number is the maximum number of threads that
should be allowed to access a queue at one time. Setting it to
a value > 1 will cause the Crawl-Delay value from robots.txt to
be ignored and the value of fetcher.server.min.delay to be used
as a delay between successive requests to the same server instead
of fetcher.server.delay.
</description>
</property>
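<!-- Note on the interplay of the values in this file (as described above): since
     fetcher.threads.per.queue = 20 is greater than 1, robots.txt Crawl-Delay and
     fetcher.server.delay are not used; fetcher.server.min.delay = 5.0 s governs
     the pause between successive requests to the same host, with up to 20
     threads per host queue. -->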
<property>
<name>fetcher.queue.mode</name>
<value>byHost</value>
<description>Determines how to put URLs into queues. Default value is 'byHost',
also takes 'byDomain' or 'byIP'.
</description>
</property>
<property>
<name>fetcher.verbose</name>
<value>true</value>
<description>If true, fetcher will log more verbosely.</description>
</property>
<property>
<name>fetcher.parse</name>