From d2c45c7370ee5456821c8a38cdb397f59abbb96d Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Wed, 9 Mar 2022 22:47:34 +0900 Subject: [PATCH] Update Lindera and Tantivy (#37) * Update Lindera and Tantivy * Update CHANGES.md * Update workflows * Update examples and bench * Fix lint * Remove example * Update test --- .github/workflows/periodic.yml | 2 + .github/workflows/regression.yml | 2 + CHANGES.md | 3 + Cargo.toml | 18 ++- Makefile | 3 - README.md | 32 +++-- benches/bench.rs | 28 ++-- examples/cc-cedict_example.rs | 125 ++++++++++++++++++ .../{basic_example.rs => ipadic_example.rs} | 33 +++-- examples/ko-dic_example.rs | 125 ++++++++++++++++++ examples/unidic_example.rs | 125 ++++++++++++++++++ src/tokenizer.rs | 78 +++-------- 12 files changed, 467 insertions(+), 107 deletions(-) create mode 100644 examples/cc-cedict_example.rs rename examples/{basic_example.rs => ipadic_example.rs} (82%) create mode 100644 examples/ko-dic_example.rs create mode 100644 examples/unidic_example.rs diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 5384f31..5661d75 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -11,6 +11,7 @@ jobs: matrix: os: [ubuntu-latest, macOS-latest, windows-latest] toolchain: [stable, beta, nightly] + features: ["ipadic", "ko-dic", "cc-cedict"] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v1 @@ -22,3 +23,4 @@ jobs: - uses: actions-rs/cargo@v1 with: command: test + args: --features "${{ matrix.features }}" diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index eaef7aa..d3f9b07 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -32,6 +32,7 @@ jobs: matrix: os: [ubuntu-latest, macOS-latest, windows-latest] toolchain: [stable] + features: ["ipadic", "ko-dic", "cc-cedict"] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v1 @@ -43,6 +44,7 @@ jobs: - uses: actions-rs/cargo@v1 with: command: test + args: 
--features "${{ matrix.features }}" fmt: name: Format diff --git a/CHANGES.md b/CHANGES.md index 40bcd9f..ae18520 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,9 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## 0.11.1 (2022-03-09) +- Update Lindera and Tantivy #37 @mosuka + ## 0.10.0 (2022-02-25) - Update lindera to 0.10.0 #32 @mosuka diff --git a/Cargo.toml b/Cargo.toml index 9042f42..c14eaec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "lindera-tantivy" -version = "0.10.0" +version = "0.11.1" edition = "2021" -description = "A Tokenizer for Tantivy, based on Lindera." +description = "Lindera Tokenizer for Tantivy." documentation = "https://docs.rs/lindera-tantivy" homepage = "https://github.com/lindera-morphology/lindera-tantivy" repository = "https://github.com/lindera-morphology/lindera-tantivy" @@ -11,11 +11,19 @@ keywords = ["tokenizer", "tantivy", "lindera"] categories = ["text-processing"] license = "MIT" +[features] +default = [] +full = ["ipadic", "unidic", "ko-dic", "cc-cedict"] +ipadic = ["lindera/ipadic"] # Japanese dictionary +unidic = ["lindera/unidic"] # Japanese dictionary +ko-dic = ["lindera/ko-dic"] # Korean dictionary +cc-cedict = ["lindera/cc-cedict"] # Chinese dictionary + [dependencies] -tantivy = "0.16" +tantivy = "0.17" -lindera = "0.10.0" -lindera-core = "0.10.0" +lindera = "0.11.1" +lindera-core = "0.11.1" [dev-dependencies] criterion = "0.3" diff --git a/Makefile b/Makefile index 140459f..46947e9 100644 --- a/Makefile +++ b/Makefile @@ -8,9 +8,6 @@ clean: format: cargo fmt -build: - cargo build --release - test: cargo test diff --git a/README.md b/README.md index a354e2c..c62b179 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,24 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Join the chat at 
https://gitter.im/lindera-morphology/lindera](https://badges.gitter.im/lindera-morphology/lindera.svg)](https://gitter.im/lindera-morphology/lindera?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -A Tokenizer for [Tantivy](https://github.com/tantivy-search/tantivy), based on [Lindera](https://github.com/lindera-morphology/lindera). +[Lindera](https://github.com/lindera-morphology/lindera) Tokenizer for [Tantivy](https://github.com/tantivy-search/tantivy). -## Build -The following products are required to build: +## Usage -- Rust >= 1.46.0 +Make sure you have activated the required dictionaries for Lindera in Cargo.toml. +The following example enables IPADIC. -```text -% cargo build --release +``` +[dependencies] +lindera = { version = "0.11.1", features = ["ipadic"] } ``` -## Usage +- ipadic: Japanese dictionary +- unidic: Japanese dictionary +- ko-dic: Korean dictionary +- cc-cedict: Chinese dictionary + ### Basic example @@ -24,7 +29,7 @@ use tantivy::query::QueryParser; use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}; use tantivy::{doc, Index}; -use lindera::tokenizer::{TokenizerConfig, UserDictionaryType}; +use lindera::tokenizer::{TokenizerConfig, UserDictionaryType, DictionaryType}; use lindera_core::viterbi::{Mode, Penalty}; use lindera_tantivy::tokenizer::LinderaTokenizer; @@ -75,9 +80,10 @@ fn main() -> tantivy::Result<()> { let index = Index::create_in_ram(schema.clone()); let config = TokenizerConfig { + dict_type: DictionaryType::Ipadic, dict_path: None, user_dict_path: None, - user_dict_type: UserDictionaryType::CSV, + user_dict_type: UserDictionaryType::Csv, mode: Mode::Decompose(Penalty::default()), }; @@ -93,22 +99,22 @@ fn main() -> tantivy::Result<()> { index_writer.add_document(doc!( id => "1", title => "成田国際空港", - body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である[1]。首都圏東部(東京の東60km)に位置している。空港コードはNRT。" - )); + body => 
"成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。" + )).unwrap(); // add document index_writer.add_document(doc!( id => "2", title => "東京国際空港", body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。" - )); + )).unwrap(); // add document index_writer.add_document(doc!( id => "3", title => "関西国際空港", body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。" - )); + )).unwrap(); // commit index_writer.commit()?; diff --git a/benches/bench.rs b/benches/bench.rs index 05e823d..474fe77 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -1,17 +1,19 @@ use criterion::Criterion; use criterion::{criterion_group, criterion_main}; -use tantivy::doc; -use tantivy::schema::IndexRecordOption; -use tantivy::schema::Schema; -use tantivy::schema::TextFieldIndexing; -use tantivy::schema::TextOptions; -use tantivy::Index; - -use lindera::tokenizer::{TokenizerConfig, UserDictionaryType}; -use lindera_core::viterbi::{Mode, Penalty}; -use lindera_tantivy::tokenizer::LinderaTokenizer; +#[cfg(feature = "ipadic")] fn bench_indexing(c: &mut Criterion) { + use tantivy::doc; + use tantivy::schema::IndexRecordOption; + use tantivy::schema::Schema; + use tantivy::schema::TextFieldIndexing; + use tantivy::schema::TextOptions; + use tantivy::Index; + + use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType}; + use lindera_core::viterbi::{Mode, Penalty}; + use lindera_tantivy::tokenizer::LinderaTokenizer; + // create schema builder let mut schema_builder = Schema::builder(); @@ -46,9 +48,10 @@ fn bench_indexing(c: &mut Criterion) { let index = Index::create_in_ram(schema.clone()); let config = TokenizerConfig { + dict_type: DictionaryType::Ipadic, dict_path: None, user_dict_path: None, - user_dict_type: 
UserDictionaryType::CSV, + user_dict_type: UserDictionaryType::Csv, mode: Mode::Decompose(Penalty::default()), }; @@ -78,5 +81,8 @@ fn bench_indexing(c: &mut Criterion) { group.finish(); } +#[cfg(not(feature = "ipadic"))] +fn bench_indexing(_c: &mut Criterion) {} + criterion_group!(benches, bench_indexing,); criterion_main!(benches); diff --git a/examples/cc-cedict_example.rs b/examples/cc-cedict_example.rs new file mode 100644 index 0000000..d8aaf32 --- /dev/null +++ b/examples/cc-cedict_example.rs @@ -0,0 +1,125 @@ +#[cfg(feature = "cc-cedict")] +fn main() -> tantivy::Result<()> { + use tantivy::collector::TopDocs; + use tantivy::query::QueryParser; + use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}; + use tantivy::{doc, Index}; + + use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType}; + use lindera_core::viterbi::{Mode, Penalty}; + use lindera_tantivy::tokenizer::LinderaTokenizer; + + // create schema builder + let mut schema_builder = Schema::builder(); + + // add id field + let id = schema_builder.add_text_field( + "id", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("raw") + .set_index_option(IndexRecordOption::Basic), + ) + .set_stored(), + ); + + // add title field + let title = schema_builder.add_text_field( + "title", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("lang_zh") + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ) + .set_stored(), + ); + + // add body field + let body = schema_builder.add_text_field( + "body", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("lang_zh") + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ) + .set_stored(), + ); + + // build schema + let schema = schema_builder.build(); + + // create index on memory + let index = Index::create_in_ram(schema.clone()); + + let config = 
TokenizerConfig { + dict_type: DictionaryType::Cedict, + dict_path: None, + user_dict_path: None, + user_dict_type: UserDictionaryType::Csv, + mode: Mode::Decompose(Penalty::default()), + }; + + // register Lindera tokenizer + index + .tokenizers() + .register("lang_zh", LinderaTokenizer::with_config(config).unwrap()); + + // create index writer + let mut index_writer = index.writer(50_000_000)?; + + // add document + index_writer.add_document(doc!( + id => "1", + title => "成田国际机场", + body => "成田國際機場(日语:成田国際空港/なりたこくさいくうこう Narita Kokusai Kūkō */?;IATA代码:NRT;ICAO代码:RJAA),通稱成田機場(成田空港),原名新東京國際機場(新東京国際空港/しんとうきょうこくさいくうこう Shin-Tōkyō Kokusai Kūkō),是位於日本千葉縣成田市的國際機場,與羽田機場並列為東京兩大聯外機場。占地1,111公頃,擁有3座客運航廈,客運流量居日本第二位,貨運吞吐量則居日本第一、全球第九。根據日本機場分類法,其劃分為據點機場。" + )).unwrap(); + + // add document + index_writer.add_document(doc!( + id => "2", + title => "東京國際機場", + body => "東京國際機場(日语:東京国際空港/とうきょうこくさいくうこう Tōkyō Kokusai Kūkō */?;IATA代码:HND;ICAO代码:RJTT)是位於日本東京都大田區的機場,因座落於羽田地區而通稱為羽田機場(羽田空港/はねだくうこう Haneda Kūkō),啟用於1931年8月25日,與成田國際機場並列為東京兩大聯外機場。" + )).unwrap(); + + // add document + index_writer.add_document(doc!( + id => "3", + title => "关西国际机场", + body => "關西國際機場(日语:関西国際空港/かんさいこくさいくうこう Kansai kokusai kūkō */?,英語:Kansai International Airport,IATA代码:KIX;ICAO代码:RJBB),常通稱為關西機場、大阪關西機場或關空[註 1],是位於日本大阪府的機場,坐落於大阪湾东南部的泉州近海離岸5公里的人工島上,面積約1,067.7公頃[2],行政區劃橫跨大阪府的泉佐野市(北)、田尻町(中)以及泉南市(南)。" + )).unwrap(); + + // commit + index_writer.commit()?; + + // create reader + let reader = index.reader()?; + + // create searcher + let searcher = reader.searcher(); + + // create query parser + let query_parser = QueryParser::for_index(&index, vec![title, body]); + + // parse query + let query_str = "東京"; + let query = query_parser.parse_query(query_str)?; + println!("Query String: {}", query_str); + + // search + let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; + println!("Search Result:"); + for (_, doc_address) in top_docs { + let retrieved_doc = searcher.doc(doc_address)?; + println!("{}", 
schema.to_json(&retrieved_doc)); + } + + Ok(()) +} + +#[cfg(not(feature = "cc-cedict"))] +fn main() -> tantivy::Result<()> { + Ok(()) +} diff --git a/examples/basic_example.rs b/examples/ipadic_example.rs similarity index 82% rename from examples/basic_example.rs rename to examples/ipadic_example.rs index 0e3512a..f21dd04 100644 --- a/examples/basic_example.rs +++ b/examples/ipadic_example.rs @@ -1,13 +1,14 @@ -use tantivy::collector::TopDocs; -use tantivy::query::QueryParser; -use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}; -use tantivy::{doc, Index}; +#[cfg(feature = "ipadic")] +fn main() -> tantivy::Result<()> { + use tantivy::collector::TopDocs; + use tantivy::query::QueryParser; + use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}; + use tantivy::{doc, Index}; -use lindera::tokenizer::{TokenizerConfig, UserDictionaryType}; -use lindera_core::viterbi::{Mode, Penalty}; -use lindera_tantivy::tokenizer::LinderaTokenizer; + use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType}; + use lindera_core::viterbi::{Mode, Penalty}; + use lindera_tantivy::tokenizer::LinderaTokenizer; -fn main() -> tantivy::Result<()> { // create schema builder let mut schema_builder = Schema::builder(); @@ -54,9 +55,10 @@ fn main() -> tantivy::Result<()> { let index = Index::create_in_ram(schema.clone()); let config = TokenizerConfig { + dict_type: DictionaryType::Ipadic, dict_path: None, user_dict_path: None, - user_dict_type: UserDictionaryType::CSV, + user_dict_type: UserDictionaryType::Csv, mode: Mode::Decompose(Penalty::default()), }; @@ -72,22 +74,22 @@ fn main() -> tantivy::Result<()> { index_writer.add_document(doc!( id => "1", title => "成田国際空港", - body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である[1]。首都圏東部(東京の東60km)に位置している。空港コードはNRT。" - )); + body => "成田国際空港(なりたこくさいくうこう、英: Narita International 
Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。" + )).unwrap(); // add document index_writer.add_document(doc!( id => "2", title => "東京国際空港", body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。" - )); + )).unwrap(); // add document index_writer.add_document(doc!( id => "3", title => "関西国際空港", body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。" - )); + )).unwrap(); // commit index_writer.commit()?; @@ -116,3 +118,8 @@ fn main() -> tantivy::Result<()> { Ok(()) } + +#[cfg(not(feature = "ipadic"))] +fn main() -> tantivy::Result<()> { + Ok(()) +} diff --git a/examples/ko-dic_example.rs b/examples/ko-dic_example.rs new file mode 100644 index 0000000..6d7480d --- /dev/null +++ b/examples/ko-dic_example.rs @@ -0,0 +1,125 @@ +#[cfg(feature = "ko-dic")] +fn main() -> tantivy::Result<()> { + use tantivy::collector::TopDocs; + use tantivy::query::QueryParser; + use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}; + use tantivy::{doc, Index}; + + use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType}; + use lindera_core::viterbi::{Mode, Penalty}; + use lindera_tantivy::tokenizer::LinderaTokenizer; + + // create schema builder + let mut schema_builder = Schema::builder(); + + // add id field + let id = schema_builder.add_text_field( + "id", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("raw") + .set_index_option(IndexRecordOption::Basic), + ) + .set_stored(), + ); + + // add title field + let title = schema_builder.add_text_field( + "title", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("lang_ko") + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ) + .set_stored(), + ); + + // add 
body field + let body = schema_builder.add_text_field( + "body", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("lang_ko") + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ) + .set_stored(), + ); + + // build schema + let schema = schema_builder.build(); + + // create index on memory + let index = Index::create_in_ram(schema.clone()); + + let config = TokenizerConfig { + dict_type: DictionaryType::Kodic, + dict_path: None, + user_dict_path: None, + user_dict_type: UserDictionaryType::Csv, + mode: Mode::Decompose(Penalty::default()), + }; + + // register Lindera tokenizer + index + .tokenizers() + .register("lang_ko", LinderaTokenizer::with_config(config).unwrap()); + + // create index writer + let mut index_writer = index.writer(50_000_000)?; + + // add document + index_writer.add_document(doc!( + id => "1", + title => "나리타 국제공항", + body => "나리타 국제공항(일본어: 成田国際空港, 영어: Narita International Airport, IATA: NRT, ICAO: RJAA)은 일본 지바현 나리타시에 위치한 국제공항으로, 도쿄도 도심에서 동북쪽으로 약 62km 떨어져 있다." + )).unwrap(); + + // add document + index_writer.add_document(doc!( + id => "2", + title => "도쿄 국제공항", + body => "도쿄국제공항(일본어: 東京国際空港、とうきょうこくさいくうこう, 영어: Tokyo International Airport)은 일본 도쿄도 오타구에 있는 공항이다. 보통 이 일대의 옛 지명을 본뜬 하네다 공항(일본어: 羽田空港, 영어: Haneda Airport)이라고 불린다." + )).unwrap(); + + // add document + index_writer.add_document(doc!( + id => "3", + title => "간사이 국제공항", + body => "간사이 국제공항(일본어: 関西国際空港, IATA: KIX, ICAO: RJBB)은 일본 오사카부 오사카 만에 조성된 인공섬에 위치한 일본의 공항으로, 대한민국의 인천국제공항보다 6년 반 앞선 1994년 9월 4일에 개항했다." 
+ )).unwrap(); + + // commit + index_writer.commit()?; + + // create reader + let reader = index.reader()?; + + // create searcher + let searcher = reader.searcher(); + + // create query parser + let query_parser = QueryParser::for_index(&index, vec![title, body]); + + // parse query + let query_str = "도쿄"; + let query = query_parser.parse_query(query_str)?; + println!("Query String: {}", query_str); + + // search + let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; + println!("Search Result:"); + for (_, doc_address) in top_docs { + let retrieved_doc = searcher.doc(doc_address)?; + println!("{}", schema.to_json(&retrieved_doc)); + } + + Ok(()) +} + +#[cfg(not(feature = "ko-dic"))] +fn main() -> tantivy::Result<()> { + Ok(()) +} diff --git a/examples/unidic_example.rs b/examples/unidic_example.rs new file mode 100644 index 0000000..93f4b6a --- /dev/null +++ b/examples/unidic_example.rs @@ -0,0 +1,125 @@ +#[cfg(feature = "unidic")] +fn main() -> tantivy::Result<()> { + use tantivy::collector::TopDocs; + use tantivy::query::QueryParser; + use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}; + use tantivy::{doc, Index}; + + use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType}; + use lindera_core::viterbi::{Mode, Penalty}; + use lindera_tantivy::tokenizer::LinderaTokenizer; + + // create schema builder + let mut schema_builder = Schema::builder(); + + // add id field + let id = schema_builder.add_text_field( + "id", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("raw") + .set_index_option(IndexRecordOption::Basic), + ) + .set_stored(), + ); + + // add title field + let title = schema_builder.add_text_field( + "title", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("lang_ja") + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ) + .set_stored(), + ); + + // add body field + let body = 
schema_builder.add_text_field( + "body", + TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("lang_ja") + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ) + .set_stored(), + ); + + // build schema + let schema = schema_builder.build(); + + // create index on memory + let index = Index::create_in_ram(schema.clone()); + + let config = TokenizerConfig { + dict_type: DictionaryType::Unidic, + dict_path: None, + user_dict_path: None, + user_dict_type: UserDictionaryType::Csv, + mode: Mode::Decompose(Penalty::default()), + }; + + // register Lindera tokenizer + index + .tokenizers() + .register("lang_ja", LinderaTokenizer::with_config(config).unwrap()); + + // create index writer + let mut index_writer = index.writer(50_000_000)?; + + // add document + index_writer.add_document(doc!( + id => "1", + title => "成田国際空港", + body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。" + )).unwrap(); + + // add document + index_writer.add_document(doc!( + id => "2", + title => "東京国際空港", + body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。" + )).unwrap(); + + // add document + index_writer.add_document(doc!( + id => "3", + title => "関西国際空港", + body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。" + )).unwrap(); + + // commit + index_writer.commit()?; + + // create reader + let reader = index.reader()?; + + // create searcher + let searcher = reader.searcher(); + + // create query parser + let query_parser = QueryParser::for_index(&index, vec![title, body]); + + // parse query + let query_str = "東京"; + let query = query_parser.parse_query(query_str)?; + println!("Query String: {}", query_str); + + // search + let top_docs = searcher.search(&query, 
&TopDocs::with_limit(10))?; + println!("Search Result:"); + for (_, doc_address) in top_docs { + let retrieved_doc = searcher.doc(doc_address)?; + println!("{}", schema.to_json(&retrieved_doc)); + } + + Ok(()) +} + +#[cfg(not(feature = "unidic"))] +fn main() -> tantivy::Result<()> { + Ok(()) +} diff --git a/src/tokenizer.rs b/src/tokenizer.rs index ea0f2c3..6f56534 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -6,66 +6,6 @@ use lindera_core::LinderaResult; use crate::stream::LinderaTokenStream; /// Tokenize text with the specified mode and dictionary. -/// -/// Example: `すもももももももものうち` would be tokenized as (mode: "normal", dict: "") -/// -/// | Term | すもも | も | もも | も | もも | の | うち | -/// |----------|--------|--------|--------|--------|--------|--------|--------| -/// | Position | 0 | 1 | 2 | 3 | 4 | 5 | 6 | -/// | Offsets | 0,9 | 9,12 | 12,18 | 18,21 | 21,27 | 27,30 | 30,36 | -/// -/// # Example -/// -/// ```rust -/// use lindera_tantivy::tokenizer::*; -/// use tantivy::tokenizer::Tokenizer; -/// -/// let tokenizer = LinderaTokenizer::new().unwrap(); -/// let mut stream = tokenizer.token_stream("すもももももももものうち"); -/// { -/// let token = stream.next().unwrap(); -/// assert_eq!(token.text, "すもも"); -/// assert_eq!(token.offset_from, 0); -/// assert_eq!(token.offset_to, 9); -/// } -/// { -/// let token = stream.next().unwrap(); -/// assert_eq!(token.text, "も"); -/// assert_eq!(token.offset_from, 9); -/// assert_eq!(token.offset_to, 12); -/// } -/// { -/// let token = stream.next().unwrap(); -/// assert_eq!(token.text, "もも"); -/// assert_eq!(token.offset_from, 12); -/// assert_eq!(token.offset_to, 18); -/// } -/// { -/// let token = stream.next().unwrap(); -/// assert_eq!(token.text, "も"); -/// assert_eq!(token.offset_from, 18); -/// assert_eq!(token.offset_to, 21); -/// } -/// { -/// let token = stream.next().unwrap(); -/// assert_eq!(token.text, "もも"); -/// assert_eq!(token.offset_from, 21); -/// assert_eq!(token.offset_to, 27); -/// } -/// { -/// let token = 
stream.next().unwrap(); -/// assert_eq!(token.text, "の"); -/// assert_eq!(token.offset_from, 27); -/// assert_eq!(token.offset_to, 30); -/// } -/// { -/// let token = stream.next().unwrap(); -/// assert_eq!(token.text, "うち"); -/// assert_eq!(token.offset_from, 30); -/// assert_eq!(token.offset_to, 36); -/// } -/// assert!(stream.next().is_none()); -/// ``` pub struct LinderaTokenizer { pub tokenizer: LTokenizer, } @@ -109,10 +49,16 @@ impl Tokenizer for LinderaTokenizer { } #[cfg(test)] +#[cfg(feature = "ipadic")] mod tests { - use crate::tokenizer::LinderaTokenizer; use tantivy::tokenizer::{BoxTokenStream, Token, Tokenizer}; + use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType}; + use lindera_core::viterbi::Mode; + use lindera_core::viterbi::Penalty; + + use crate::tokenizer::LinderaTokenizer; + fn test_helper(mut tokenizer: BoxTokenStream) -> Vec { let mut tokens: Vec = vec![]; tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); @@ -121,8 +67,16 @@ mod tests { #[test] fn test_tokenizer_equal() { + let config = TokenizerConfig { + dict_type: DictionaryType::Ipadic, + dict_path: None, + user_dict_path: None, + user_dict_type: UserDictionaryType::Csv, + mode: Mode::Decompose(Penalty::default()), + }; + let tokens = test_helper( - LinderaTokenizer::new() + LinderaTokenizer::with_config(config) .unwrap() .token_stream("すもももももももものうち"), );