Update Lindera and Tantivy (#37)
* Update Lindera and Tantivy

* Update CHANGES.md

* Update workflows

* Update examples and bench

* Fix lint

* Remove example

* Update test
mosuka authored Mar 9, 2022
1 parent d97a542 commit d2c45c7
Showing 12 changed files with 467 additions and 107 deletions.
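For code that depends on this crate, the visible API change is that lindera 0.11 selects the dictionary through a new `dict_type` field on `TokenizerConfig` and renames `UserDictionaryType::CSV` to `UserDictionaryType::Csv`, as the README and example diffs below show. A minimal construction sketch, assuming the `ipadic` feature is enabled:

```rust
use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

fn main() {
    // lindera 0.11: the dictionary is chosen explicitly via `dict_type`.
    let config = TokenizerConfig {
        dict_type: DictionaryType::Ipadic,       // new field
        dict_path: None,
        user_dict_path: None,
        user_dict_type: UserDictionaryType::Csv, // renamed from `CSV`
        mode: Mode::Decompose(Penalty::default()),
    };

    // Build the Tantivy tokenizer from the config, as in the updated README example.
    let _tokenizer = LinderaTokenizer::with_config(config).unwrap();
}
```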
2 changes: 2 additions & 0 deletions .github/workflows/periodic.yml
@@ -11,6 +11,7 @@ jobs:
matrix:
os: [ubuntu-latest, macOS-latest, windows-latest]
toolchain: [stable, beta, nightly]
features: ["ipadic", "ko-dic", "cc-cedict"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v1
@@ -22,3 +23,4 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: test
args: --features "${{ matrix.features }}"
2 changes: 2 additions & 0 deletions .github/workflows/regression.yml
@@ -32,6 +32,7 @@ jobs:
matrix:
os: [ubuntu-latest, macOS-latest, windows-latest]
toolchain: [stable]
features: ["ipadic", "ko-dic", "cc-cedict"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v1
@@ -43,6 +44,7 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: test
args: --features "${{ matrix.features }}"

fmt:
name: Format
3 changes: 3 additions & 0 deletions CHANGES.md
@@ -2,6 +2,9 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## 0.11.1 (2022-03-09)
- Update Lindera and Tantivy #37 @mosuka

## 0.10.0 (2022-02-25)
- Update lindera to 0.10.0 #32 @mosuka

18 changes: 13 additions & 5 deletions Cargo.toml
@@ -1,8 +1,8 @@
[package]
name = "lindera-tantivy"
version = "0.10.0"
version = "0.11.1"
edition = "2021"
description = "A Tokenizer for Tantivy, based on Lindera."
description = "Lindera Tokenizer for Tantivy."
documentation = "https://docs.rs/lindera-tantivy"
homepage = "https://github.com/lindera-morphology/lindera-tantivy"
repository = "https://github.com/lindera-morphology/lindera-tantivy"
@@ -11,11 +11,19 @@ keywords = ["tokenizer", "tantivy", "lindera"]
categories = ["text-processing"]
license = "MIT"

[features]
default = []
full = ["ipadic", "unidic", "ko-dic", "cc-cedict"]
ipadic = ["lindera/ipadic"] # Japanese dictionary
unidic = ["lindera/unidic"] # Japanese dictionary
ko-dic = ["lindera/ko-dic"] # Korean dictionary
cc-cedict = ["lindera/cc-cedict"] # Chinese dictionary

[dependencies]
tantivy = "0.16"
tantivy = "0.17"

lindera = "0.10.0"
lindera-core = "0.10.0"
lindera = "0.11.1"
lindera-core = "0.11.1"

[dev-dependencies]
criterion = "0.3"
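Since every dictionary is now an opt-in Cargo feature, code that needs a particular dictionary is compiled conditionally; this is the pattern the updated benchmark and the new example below follow. A minimal skeleton (the function name is illustrative):

```rust
// Compile the IPADIC-specific path only when the `ipadic` feature is enabled,
// and keep a no-op fallback so the crate still builds without it.
#[cfg(feature = "ipadic")]
fn index_with_ipadic() {
    // ... build the schema, register the Lindera tokenizer, add documents ...
}

#[cfg(not(feature = "ipadic"))]
fn index_with_ipadic() {}

fn main() {
    index_with_ipadic();
}
```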
3 changes: 0 additions & 3 deletions Makefile
@@ -8,9 +8,6 @@ clean:
format:
cargo fmt

build:
cargo build --release

test:
cargo test

32 changes: 19 additions & 13 deletions README.md
@@ -2,19 +2,24 @@

[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Join the chat at https://gitter.im/lindera-morphology/lindera](https://badges.gitter.im/lindera-morphology/lindera.svg)](https://gitter.im/lindera-morphology/lindera?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

A Tokenizer for [Tantivy](https://github.com/tantivy-search/tantivy), based on [Lindera](https://github.com/lindera-morphology/lindera).
[Lindera](https://github.com/lindera-morphology/lindera) Tokenizer for [Tantivy](https://github.com/tantivy-search/tantivy).

## Build

The following products are required to build:
## Usage

- Rust >= 1.46.0
Make sure you have activated the required dictionaries for Lindera in Cargo.toml.
The following example enables IPADIC.

```text
% cargo build --release
```

```
[dependencies]
lindera = { version = "0.11.1", features = ["ipadic"] }
```

## Usage
- ipadic: Japanese dictionary
- unidic: Japanese dictionary
- ko-dic: Korean dictionary
- cc-cedict: Chinese dictionary
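
When more than one dictionary feature is enabled, a separate tokenizer can be registered per language. A sketch based on the registration calls in this commit's examples; the `register_tokenizers` helper is hypothetical, and only the `Ipadic` and `Cedict` variants that appear in the diff are used:

```rust
use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;
use tantivy::Index;

// Register one Lindera tokenizer per enabled dictionary feature, so that
// fields can name "lang_ja" or "lang_zh" as their tokenizer.
fn register_tokenizers(index: &Index) {
    #[cfg(feature = "ipadic")]
    index.tokenizers().register(
        "lang_ja",
        LinderaTokenizer::with_config(TokenizerConfig {
            dict_type: DictionaryType::Ipadic,
            dict_path: None,
            user_dict_path: None,
            user_dict_type: UserDictionaryType::Csv,
            mode: Mode::Decompose(Penalty::default()),
        })
        .unwrap(),
    );

    #[cfg(feature = "cc-cedict")]
    index.tokenizers().register(
        "lang_zh",
        LinderaTokenizer::with_config(TokenizerConfig {
            dict_type: DictionaryType::Cedict,
            dict_path: None,
            user_dict_path: None,
            user_dict_type: UserDictionaryType::Csv,
            mode: Mode::Decompose(Penalty::default()),
        })
        .unwrap(),
    );
}
```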


### Basic example

@@ -24,7 +29,7 @@ use tantivy::query::QueryParser;
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::{doc, Index};

use lindera::tokenizer::{TokenizerConfig, UserDictionaryType};
use lindera::tokenizer::{TokenizerConfig, UserDictionaryType, DictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

@@ -75,9 +80,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());

let config = TokenizerConfig {
dict_type: DictionaryType::Ipadic,
dict_path: None,
user_dict_path: None,
user_dict_type: UserDictionaryType::CSV,
user_dict_type: UserDictionaryType::Csv,
mode: Mode::Decompose(Penalty::default()),
};

@@ -93,22 +99,22 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!(
id => "1",
title => "成田国際空港",
body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である[1]。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
));
body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "2",
title => "東京国際空港",
body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
));
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "3",
title => "関西国際空港",
body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
));
)).unwrap();

// commit
index_writer.commit()?;
28 changes: 17 additions & 11 deletions benches/bench.rs
@@ -1,17 +1,19 @@
use criterion::Criterion;
use criterion::{criterion_group, criterion_main};
use tantivy::doc;
use tantivy::schema::IndexRecordOption;
use tantivy::schema::Schema;
use tantivy::schema::TextFieldIndexing;
use tantivy::schema::TextOptions;
use tantivy::Index;

use lindera::tokenizer::{TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

#[cfg(feature = "ipadic")]
fn bench_indexing(c: &mut Criterion) {
use tantivy::doc;
use tantivy::schema::IndexRecordOption;
use tantivy::schema::Schema;
use tantivy::schema::TextFieldIndexing;
use tantivy::schema::TextOptions;
use tantivy::Index;

use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();

@@ -46,9 +48,10 @@ fn bench_indexing(c: &mut Criterion) {
let index = Index::create_in_ram(schema.clone());

let config = TokenizerConfig {
dict_type: DictionaryType::Ipadic,
dict_path: None,
user_dict_path: None,
user_dict_type: UserDictionaryType::CSV,
user_dict_type: UserDictionaryType::Csv,
mode: Mode::Decompose(Penalty::default()),
};

@@ -78,5 +81,8 @@ fn bench_indexing(c: &mut Criterion) {
group.finish();
}

#[cfg(not(feature = "ipadic"))]
fn bench_indexing(_c: &mut Criterion) {}

criterion_group!(benches, bench_indexing,);
criterion_main!(benches);
125 changes: 125 additions & 0 deletions examples/cc-cedict_example.rs
@@ -0,0 +1,125 @@
#[cfg(feature = "cc-cedict")]
fn main() -> tantivy::Result<()> {
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::{doc, Index};

use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();

// add id field
let id = schema_builder.add_text_field(
"id",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic),
)
.set_stored(),
);

// add title field
let title = schema_builder.add_text_field(
"title",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_zh")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// add body field
let body = schema_builder.add_text_field(
"body",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_zh")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// build schema
let schema = schema_builder.build();

// create index on memory
let index = Index::create_in_ram(schema.clone());

let config = TokenizerConfig {
dict_type: DictionaryType::Cedict,
dict_path: None,
user_dict_path: None,
user_dict_type: UserDictionaryType::Csv,
mode: Mode::Decompose(Penalty::default()),
};

// register Lindera tokenizer
index
.tokenizers()
.register("lang_zh", LinderaTokenizer::with_config(config).unwrap());

// create index writer
let mut index_writer = index.writer(50_000_000)?;

// add document
index_writer.add_document(doc!(
id => "1",
title => "成田国际机场",
body => "成田國際機場(日语:成田国際空港/なりたこくさいくうこう Narita Kokusai Kūkō */?;IATA代码:NRT;ICAO代码:RJAA),通稱成田機場(成田空港),原名新東京國際機場(新東京国際空港/しんとうきょうこくさいくうこう Shin-Tōkyō Kokusai Kūkō),是位於日本千葉縣成田市的國際機場,與羽田機場並列為東京兩大聯外機場。占地1,111公頃,擁有3座客運航廈,客運流量居日本第二位,貨運吞吐量則居日本第一、全球第九。根據日本機場分類法,其劃分為據點機場。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "2",
title => "東京國際機場",
body => "東京國際機場(日语:東京国際空港/とうきょうこくさいくうこう Tōkyō Kokusai Kūkō */?;IATA代码:HND;ICAO代码:RJTT)是位於日本東京都大田區的機場,因座落於羽田地區而通稱為羽田機場(羽田空港/はねだくうこう Haneda Kūkō),啟用於1931年8月25日,與成田國際機場並列為東京兩大聯外機場。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "3",
title => "关西国际机场",
body => "關西國際機場(日语:関西国際空港/かんさいこくさいくうこう Kansai kokusai kūkō */?,英語:Kansai International Airport,IATA代码:KIX;ICAO代码:RJBB),常通稱為關西機場、大阪關西機場或關空[註 1],是位於日本大阪府的機場,坐落於大阪湾东南部的泉州近海離岸5公里的人工島上,面積約1,067.7公頃[2],行政區劃橫跨大阪府的泉佐野市(北)、田尻町(中)以及泉南市(南)。"
)).unwrap();

// commit
index_writer.commit()?;

// create reader
let reader = index.reader()?;

// create searcher
let searcher = reader.searcher();

// create query parser
let query_parser = QueryParser::for_index(&index, vec![title, body]);

// parse query
let query_str = "東京";
let query = query_parser.parse_query(query_str)?;
println!("Query String: {}", query_str);

// search
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
println!("Search Result:");
for (_, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}

Ok(())
}

#[cfg(not(feature = "cc-cedict"))]
fn main() -> tantivy::Result<()> {
Ok(())
}