-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIndexFiles.java
154 lines (142 loc) · 5.61 KB
/
IndexFiles.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Command-line style indexer: builds a Lucene index at {@link #INDEX_PATH}
 * from the text files found under {@link #CORPUS_PATH}, one document per
 * file.
 */
public class IndexFiles {
    /** SET THE VARIABLE HERE */
    // Corpus Path of text files to be indexed
    public static final String CORPUS_PATH = "corpus-example/";
    // Path for index result
    public static final String INDEX_PATH = "index-example/";
    // True if you want to use stemmer, false otherwise
    public static final boolean USE_STEMMER = false;
    // True if you want to use stopword removal, false otherwise
    public static final boolean USE_STOPWORD = false;

    /**
     * Creates a fresh index (OpenMode.CREATE) at {@link #INDEX_PATH} and
     * indexes everything under {@link #CORPUS_PATH}.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        Path docDir = Paths.get(CORPUS_PATH);
        // Validate the corpus before touching the index: the writer is opened
        // in CREATE mode, so opening it for an unusable corpus could replace
        // an existing index with an empty one.
        if (!Files.isReadable(docDir)) {
            System.err.println("Corpus path '" + docDir.toAbsolutePath()
                    + "' does not exist or is not readable, please check the path");
            return;
        }

        // try-with-resources guarantees that the directory, the analyzer and
        // the writer are closed (in reverse order) even when indexing fails.
        try (Directory indexDir = FSDirectory.open(Paths.get(INDEX_PATH));
                Analyzer analyzer = new CustomizedIndonesianAnalyzer(
                        USE_STEMMER, USE_STOPWORD)) {
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            config.setOpenMode(OpenMode.CREATE);
            try (IndexWriter indexWriter = new IndexWriter(indexDir, config)) {
                indexDocs(indexWriter, docDir);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes the given file using the given writer, or if a directory is
     * given, recurses over files and directories found under the given
     * directory.
     *
     * NOTE: This method indexes one document per input file. This is slow. For
     * good throughput, put multiple documents into your input file(s). An
     * example of this is in the Lucene benchmark module, which can create
     * "line doc" files, one document per line, using
     * {@code WriteLineDocTask}.
     *
     * @param writer
     *            Writer to the index where the given file/dir info will be
     *            stored
     * @param path
     *            The file to index, or the directory to recurse into to find
     *            files to index
     * @throws IOException
     *             If there is a low-level I/O error
     */
    public static void indexDocs(final IndexWriter writer, Path path)
            throws IOException {
        if (Files.isDirectory(path)) {
            Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path file,
                        BasicFileAttributes attrs) throws IOException {
                    try {
                        indexDoc(writer, file, attrs.lastModifiedTime()
                                .toMillis());
                    } catch (IOException ignore) {
                        // Best effort: a file that cannot be read is reported
                        // and skipped so the rest of the tree still gets
                        // indexed.
                        ignore.printStackTrace();
                    }
                    return FileVisitResult.CONTINUE;
                }
            });
        } else {
            indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
        }
    }

    /**
     * Indexes a single document.
     *
     * @param writer
     *            Writer to the index the document is added to
     * @param file
     *            The file whose path and contents are indexed
     * @param lastModified
     *            Last-modified time of the file in milliseconds, indexed in
     *            the "modified" field
     * @throws IOException
     *             If the file cannot be opened or read, or the writer fails
     */
    static void indexDoc(IndexWriter writer, Path file, long lastModified)
            throws IOException {
        try (InputStream stream = Files.newInputStream(file)) {
            // make a new, empty document
            Document doc = new Document();

            // Add the path of the file as a field named "path". Use a
            // field that is indexed (i.e. searchable), but don't tokenize
            // the field into separate words and don't index term frequency
            // or positional information:
            Field pathField = new StringField("path", file.toString(),
                    Field.Store.YES);
            doc.add(pathField);

            // Add the last modified date of the file as a field named
            // "modified". Use a LongField that is indexed (i.e. efficiently
            // filterable with NumericRangeFilter). This indexes to
            // milli-second resolution, which is often too fine. You could
            // instead create a number based on
            // year/month/day/hour/minutes/seconds, down the resolution you
            // require. For example the long value 2011021714 would mean
            // February 17, 2011, 2-3 PM.
            doc.add(new LongField("modified", lastModified, Field.Store.NO));

            // Add the contents of the file to a field named "contents".
            // Specify a Reader, so that the text of the file is tokenized and
            // indexed, but not stored. The stream is decoded as UTF-8; if the
            // file uses another encoding, searching for special characters
            // will fail.
            doc.add(new TextField("contents", new BufferedReader(
                    new InputStreamReader(stream, StandardCharsets.UTF_8))));

            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                // New index, so we just add the document (no old document can
                // be there):
                System.out.println("adding " + file);
                writer.addDocument(doc);
            } else {
                // Existing index (an old copy of this document may have been
                // indexed) so we use updateDocument instead to replace the
                // old one matching the exact path, if present:
                System.out.println("updating " + file);
                writer.updateDocument(new Term("path", file.toString()), doc);
            }
        }
    }
}