Search: Add methods for updating search indices when notes update
1 parent e536e40 · commit 5ade237
Showing 11 changed files with 257 additions and 304 deletions.
@@ -1,8 +1,30 @@
-use std::path::Path;
+use std::{path::Path, collections::HashMap};
 
+use crate::tokenizer::tokenize;
+
 pub(crate) mod archive;
 pub(crate) mod notebook;
 
 pub(crate) trait Proccessor {
     fn load(&mut self, location: &Path);
 }
+pub type DocTokenCount = HashMap<String, f32>;
+
+pub fn tokenize_document(content: String) -> DocTokenCount {
+    let mut token_counter: DocTokenCount = HashMap::new();
+    let mut total_tokens = 0.0;
+    for line in content.lines() {
+        let raw_tokens = tokenize(line);
+        total_tokens += raw_tokens.len() as f32;
+        for token in raw_tokens {
+            token_counter
+                .entry(token)
+                .and_modify(|v| *v += 1.0)
+                .or_insert(1.0);
+        }
+    }
+    for (_, val) in token_counter.iter_mut() {
+        *val /= total_tokens;
+    }
+    token_counter
+}
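The new tokenize_document helper produces per-document normalized term frequencies: each token's raw count is divided by the document's total token count, so one document's scores sum to 1.0. A minimal usage sketch follows (not part of the commit), assuming tokenize splits a line into lowercase word tokens; the real tokenizer may behave differently.

#[cfg(test)]
mod tf_sketch {
    use super::tokenize_document;

    // Hedged sketch, not from the commit: assumes `tokenize` splits a line into
    // lowercase word tokens, so the sample below yields 8 tokens with "the" twice.
    #[test]
    fn scores_are_normalized_term_frequencies() {
        let scores = tokenize_document("the quick fox jumps\nthe lazy dog naps".to_string());
        // 2 occurrences / 8 tokens = 0.25 for "the"; every other word scores 1 / 8 = 0.125.
        assert!((scores["the"] - 0.25).abs() < f32::EPSILON);
        // Per-document scores sum to 1.0 (up to float rounding).
        let total: f32 = scores.values().sum();
        assert!((total - 1.0).abs() < 1e-6);
    }
}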
@@ -1,47 +1,42 @@
-use super::Proccessor;
-use crate::{tokenizer::tokenize, Tokens};
+use super::{Proccessor, tokenize_document};
+use crate::Tokens;
 use persistance::fs::path_to_string;
 use serde::{Deserialize, Serialize};
 use std::{collections::HashMap, fs::read_dir, path::Path};
 
 #[derive(Default, Debug, Serialize, Deserialize)]
 pub(crate) struct Notebook {
     pub(crate) tokens: Tokens,
     // filename, Vec<search_terms>
     pub(crate) file_index: HashMap<String, Vec<String>>,
 }
 
 impl Proccessor for Notebook {
     fn load(&mut self, location: &Path) {
         let mut tokens: Tokens = HashMap::new();
-        let mut doc_token_counter: HashMap<String, f32> = HashMap::new();
         // For some reason using tokio::read_dir never returns in the while loop
         let mut term_index: HashMap<String, Vec<String>> = HashMap::new();
         let entries = read_dir(location).unwrap();
         entries.for_each(|entry| {
             let entry = entry.unwrap();
             if let Some(fname) = entry.file_name().to_str() {
                 if fname.ends_with(".txt") {
                     let title = fname.strip_suffix(".txt").unwrap();
                     let content = path_to_string(&entry.path()).unwrap();
-                    let mut total_tokens = 0;
-                    for line in content.lines() {
-                        let raw_tokens = tokenize(line);
-                        total_tokens += raw_tokens.len();
-                        for token in raw_tokens {
-                            doc_token_counter
-                                .entry(token)
-                                .and_modify(|v| *v += 1.)
-                                .or_insert(1.);
-                        }
-                    }
-                    for (term, count) in doc_token_counter.iter() {
+                    let doc_token_counter = tokenize_document(content);
+                    for (term, score) in doc_token_counter.iter() {
                         tokens
                             .entry(term.to_owned())
-                            .and_modify(|v| v.push((title.to_string(), *count / total_tokens as f32)))
-                            .or_insert(vec![(title.to_string(), *count / total_tokens as f32)]);
+                            .and_modify(|v| v.push((title.to_string(), *score)))
+                            .or_insert(vec![(title.to_string(), *score)]);
                         term_index
                             .entry(fname.to_owned())
                             .and_modify(|v| v.push(term.clone()))
                             .or_insert(vec![term.clone()]);
                     }
-                    doc_token_counter.clear();
                 }
             }
         });
         self.tokens = tokens;
         self.file_index = term_index;
     }
 }
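The commit title mentions methods for updating the search indices when notes change, but those methods are in files not shown in this excerpt. As a hypothetical sketch only (the method name update_note and its signature are assumptions, not the commit's API), an update against Notebook's two maps could drop the note's old terms via file_index, re-score the new content with tokenize_document, and reinsert:

// Hypothetical sketch, not from the commit: illustrates how the two maps could be
// kept in sync when a single note changes. `update_note` is an assumed name.
impl Notebook {
    pub(crate) fn update_note(&mut self, fname: &str, new_content: String) {
        let title = fname.strip_suffix(".txt").unwrap_or(fname).to_string();
        // 1. Remove the note's previous terms using the filename -> terms reverse index.
        if let Some(old_terms) = self.file_index.remove(fname) {
            for term in old_terms {
                if let Some(postings) = self.tokens.get_mut(&term) {
                    postings.retain(|(doc, _)| doc != &title);
                }
            }
        }
        // 2. Re-score the new content and reinsert into both maps.
        let doc_token_counter = tokenize_document(new_content);
        for (term, score) in doc_token_counter.iter() {
            self.tokens
                .entry(term.to_owned())
                .and_modify(|v| v.push((title.clone(), *score)))
                .or_insert_with(|| vec![(title.clone(), *score)]);
            self.file_index
                .entry(fname.to_owned())
                .and_modify(|v| v.push(term.clone()))
                .or_insert_with(|| vec![term.clone()]);
        }
    }
}

Keeping file_index as a reverse map from filename to terms is what makes the removal step cheap: only the postings lists for that note's previous terms need to be touched.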