Update Lindera and Tantivy (#37)
* Update Lindera and Tantivy

* Update CHANGES.md

* Update workflows

* Update examples and bench

* Fix lint

* Remove example

* Update test
mosuka authored Mar 9, 2022
1 parent d97a542 commit d2c45c7
Showing 12 changed files with 467 additions and 107 deletions.
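For code that depends on this crate, the visible API change is that lindera 0.11 selects the dictionary through a new `dict_type` field on `TokenizerConfig` and renames `UserDictionaryType::CSV` to `UserDictionaryType::Csv`, as the README and example diffs below show. A minimal construction sketch, assuming the `ipadic` feature is enabled:

```rust
use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

fn main() {
    // lindera 0.11: the dictionary is chosen explicitly via `dict_type`.
    let config = TokenizerConfig {
        dict_type: DictionaryType::Ipadic,       // new field
        dict_path: None,
        user_dict_path: None,
        user_dict_type: UserDictionaryType::Csv, // renamed from `CSV`
        mode: Mode::Decompose(Penalty::default()),
    };

    // Build the Tantivy tokenizer from the config, as in the updated README example.
    let _tokenizer = LinderaTokenizer::with_config(config).unwrap();
}
```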
2 changes: 2 additions & 0 deletions .github/workflows/periodic.yml
@@ -11,6 +11,7 @@ jobs:
matrix:
os: [ubuntu-latest, macOS-latest, windows-latest]
toolchain: [stable, beta, nightly]
features: ["ipadic", "ko-dic", "cc-cedict"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v1
@@ -22,3 +23,4 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: test
args: --features "${{ matrix.features }}"
2 changes: 2 additions & 0 deletions .github/workflows/regression.yml
@@ -32,6 +32,7 @@ jobs:
matrix:
os: [ubuntu-latest, macOS-latest, windows-latest]
toolchain: [stable]
features: ["ipadic", "ko-dic", "cc-cedict"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v1
@@ -43,6 +44,7 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: test
args: --features "${{ matrix.features }}"

fmt:
name: Format
3 changes: 3 additions & 0 deletions CHANGES.md
@@ -2,6 +2,9 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## 0.11.1 (2022-03-09)
- Update Lindera and Tantivy #37 @mosuka

## 0.10.0 (2022-02-25)
- Update lindera to 0.10.0 #32 @mosuka

18 changes: 13 additions & 5 deletions Cargo.toml
@@ -1,8 +1,8 @@
[package]
name = "lindera-tantivy"
version = "0.10.0"
version = "0.11.1"
edition = "2021"
description = "A Tokenizer for Tantivy, based on Lindera."
description = "Lindera Tokenizer for Tantivy."
documentation = "https://docs.rs/lindera-tantivy"
homepage = "https://github.com/lindera-morphology/lindera-tantivy"
repository = "https://github.com/lindera-morphology/lindera-tantivy"
@@ -11,11 +11,19 @@ keywords = ["tokenizer", "tantivy", "lindera"]
categories = ["text-processing"]
license = "MIT"

[features]
default = []
full = ["ipadic", "unidic", "ko-dic", "cc-cedict"]
ipadic = ["lindera/ipadic"] # Japanese dictionary
unidic = ["lindera/unidic"] # Japanese dictionary
ko-dic = ["lindera/ko-dic"] # Korean dictionary
cc-cedict = ["lindera/cc-cedict"] # Chinese dictionary

[dependencies]
tantivy = "0.16"
tantivy = "0.17"

lindera = "0.10.0"
lindera-core = "0.10.0"
lindera = "0.11.1"
lindera-core = "0.11.1"

[dev-dependencies]
criterion = "0.3"
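Since every dictionary is now an opt-in Cargo feature, code that needs a particular dictionary is compiled conditionally; this is the pattern the updated benchmark and the new example below follow. A minimal skeleton (the function name is illustrative):

```rust
// Compile the IPADIC-specific path only when the `ipadic` feature is enabled,
// and keep a no-op fallback so the crate still builds without it.
#[cfg(feature = "ipadic")]
fn index_with_ipadic() {
    // ... build the schema, register the Lindera tokenizer, add documents ...
}

#[cfg(not(feature = "ipadic"))]
fn index_with_ipadic() {}

fn main() {
    index_with_ipadic();
}
```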
3 changes: 0 additions & 3 deletions Makefile
@@ -8,9 +8,6 @@ clean:
format:
cargo fmt

build:
cargo build --release

test:
cargo test

32 changes: 19 additions & 13 deletions README.md
@@ -2,19 +2,24 @@

[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Join the chat at https://gitter.im/lindera-morphology/lindera](https://badges.gitter.im/lindera-morphology/lindera.svg)](https://gitter.im/lindera-morphology/lindera?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

A Tokenizer for [Tantivy](https://github.com/tantivy-search/tantivy), based on [Lindera](https://github.com/lindera-morphology/lindera).
[Lindera](https://github.com/lindera-morphology/lindera) Tokenizer for [Tantivy](https://github.com/tantivy-search/tantivy).

## Build

The following products are required to build:
## Usage

- Rust >= 1.46.0
Make sure you have activated the required dictionaries for Lindera in Cargo.toml.
The following example enables IPADIC.

```text
% cargo build --release
```

```
[dependencies]
lindera = { version = "0.11.1", features = ["ipadic"] }
```

## Usage
- ipadic: Japanese dictionary
- unidic: Japanese dictionary
- ko-dic: Korean dictionary
- cc-cedict: Chinese dictionary
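
When more than one dictionary feature is enabled, a separate tokenizer can be registered per language. A sketch based on the registration calls in this commit's examples; the `register_tokenizers` helper is hypothetical, and only the `Ipadic` and `Cedict` variants that appear in the diff are used:

```rust
use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;
use tantivy::Index;

// Register one Lindera tokenizer per enabled dictionary feature, so that
// fields can name "lang_ja" or "lang_zh" as their tokenizer.
fn register_tokenizers(index: &Index) {
    #[cfg(feature = "ipadic")]
    index.tokenizers().register(
        "lang_ja",
        LinderaTokenizer::with_config(TokenizerConfig {
            dict_type: DictionaryType::Ipadic,
            dict_path: None,
            user_dict_path: None,
            user_dict_type: UserDictionaryType::Csv,
            mode: Mode::Decompose(Penalty::default()),
        })
        .unwrap(),
    );

    #[cfg(feature = "cc-cedict")]
    index.tokenizers().register(
        "lang_zh",
        LinderaTokenizer::with_config(TokenizerConfig {
            dict_type: DictionaryType::Cedict,
            dict_path: None,
            user_dict_path: None,
            user_dict_type: UserDictionaryType::Csv,
            mode: Mode::Decompose(Penalty::default()),
        })
        .unwrap(),
    );
}
```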


### Basic example

@@ -24,7 +29,7 @@ use tantivy::query::QueryParser;
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::{doc, Index};

use lindera::tokenizer::{TokenizerConfig, UserDictionaryType};
use lindera::tokenizer::{TokenizerConfig, UserDictionaryType, DictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

@@ -75,9 +80,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());

let config = TokenizerConfig {
dict_type: DictionaryType::Ipadic,
dict_path: None,
user_dict_path: None,
user_dict_type: UserDictionaryType::CSV,
user_dict_type: UserDictionaryType::Csv,
mode: Mode::Decompose(Penalty::default()),
};

@@ -93,22 +99,22 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!(
id => "1",
title => "成田国際空港",
body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である[1]。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
));
body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "2",
title => "東京国際空港",
body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
));
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "3",
title => "関西国際空港",
body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
));
)).unwrap();

// commit
index_writer.commit()?;
28 changes: 17 additions & 11 deletions benches/bench.rs
@@ -1,17 +1,19 @@
use criterion::Criterion;
use criterion::{criterion_group, criterion_main};
use tantivy::doc;
use tantivy::schema::IndexRecordOption;
use tantivy::schema::Schema;
use tantivy::schema::TextFieldIndexing;
use tantivy::schema::TextOptions;
use tantivy::Index;

use lindera::tokenizer::{TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

#[cfg(feature = "ipadic")]
fn bench_indexing(c: &mut Criterion) {
use tantivy::doc;
use tantivy::schema::IndexRecordOption;
use tantivy::schema::Schema;
use tantivy::schema::TextFieldIndexing;
use tantivy::schema::TextOptions;
use tantivy::Index;

use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();

@@ -46,9 +48,10 @@ fn bench_indexing(c: &mut Criterion) {
let index = Index::create_in_ram(schema.clone());

let config = TokenizerConfig {
dict_type: DictionaryType::Ipadic,
dict_path: None,
user_dict_path: None,
user_dict_type: UserDictionaryType::CSV,
user_dict_type: UserDictionaryType::Csv,
mode: Mode::Decompose(Penalty::default()),
};

@@ -78,5 +81,8 @@ fn bench_indexing(c: &mut Criterion) {
group.finish();
}

#[cfg(not(feature = "ipadic"))]
fn bench_indexing(_c: &mut Criterion) {}

criterion_group!(benches, bench_indexing,);
criterion_main!(benches);
125 changes: 125 additions & 0 deletions examples/cc-cedict_example.rs
@@ -0,0 +1,125 @@
#[cfg(feature = "cc-cedict")]
fn main() -> tantivy::Result<()> {
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::{doc, Index};

use lindera::tokenizer::{DictionaryType, TokenizerConfig, UserDictionaryType};
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();

// add id field
let id = schema_builder.add_text_field(
"id",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic),
)
.set_stored(),
);

// add title field
let title = schema_builder.add_text_field(
"title",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_zh")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// add body field
let body = schema_builder.add_text_field(
"body",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_zh")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// build schema
let schema = schema_builder.build();

// create index on memory
let index = Index::create_in_ram(schema.clone());

let config = TokenizerConfig {
dict_type: DictionaryType::Cedict,
dict_path: None,
user_dict_path: None,
user_dict_type: UserDictionaryType::Csv,
mode: Mode::Decompose(Penalty::default()),
};

// register Lindera tokenizer
index
.tokenizers()
.register("lang_zh", LinderaTokenizer::with_config(config).unwrap());

// create index writer
let mut index_writer = index.writer(50_000_000)?;

// add document
index_writer.add_document(doc!(
id => "1",
title => "成田国际机场",
body => "成田國際機場(日语:成田国際空港/なりたこくさいくうこう Narita Kokusai Kūkō */?;IATA代码:NRT;ICAO代码:RJAA),通稱成田機場(成田空港),原名新東京國際機場(新東京国際空港/しんとうきょうこくさいくうこう Shin-Tōkyō Kokusai Kūkō),是位於日本千葉縣成田市的國際機場,與羽田機場並列為東京兩大聯外機場。占地1,111公頃,擁有3座客運航廈,客運流量居日本第二位,貨運吞吐量則居日本第一、全球第九。根據日本機場分類法,其劃分為據點機場。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "2",
title => "東京國際機場",
body => "東京國際機場(日语:東京国際空港/とうきょうこくさいくうこう Tōkyō Kokusai Kūkō */?;IATA代码:HND;ICAO代码:RJTT)是位於日本東京都大田區的機場,因座落於羽田地區而通稱為羽田機場(羽田空港/はねだくうこう Haneda Kūkō),啟用於1931年8月25日,與成田國際機場並列為東京兩大聯外機場。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "3",
title => "关西国际机场",
body => "關西國際機場(日语:関西国際空港/かんさいこくさいくうこう Kansai kokusai kūkō */?,英語:Kansai International Airport,IATA代码:KIX;ICAO代码:RJBB),常通稱為關西機場、大阪關西機場或關空[註 1],是位於日本大阪府的機場,坐落於大阪湾东南部的泉州近海離岸5公里的人工島上,面積約1,067.7公頃[2],行政區劃橫跨大阪府的泉佐野市(北)、田尻町(中)以及泉南市(南)。"
)).unwrap();

// commit
index_writer.commit()?;

// create reader
let reader = index.reader()?;

// create searcher
let searcher = reader.searcher();

// create query parser
let query_parser = QueryParser::for_index(&index, vec![title, body]);

// parse query
let query_str = "東京";
let query = query_parser.parse_query(query_str)?;
println!("Query String: {}", query_str);

// search
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
println!("Search Result:");
for (_, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}

Ok(())
}

#[cfg(not(feature = "cc-cedict"))]
fn main() -> tantivy::Result<()> {
Ok(())
}