Skip to content

Commit

Permalink
Merge pull request #55 from Ajk4/multiple_input_files
Browse files: browse the repository at this point in the history
Allow list of input files
  • Loading branch information
piobab authored Jun 24, 2022
2 parents 1481091 + 1657291 commit 53d83e5
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 19 deletions.
6 changes: 3 additions & 3 deletions src/configuration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ pub struct Configuration {
/// RAM we can support training with mmap files
pub in_memory_embedding_calculation: bool,

/// Path to the input file
pub input: String,
/// Paths to the input files
pub input: Vec<String>,

/// Type of the input files (the same type applies to every input path)
pub file_type: FileType,
Expand Down Expand Up @@ -86,7 +86,7 @@ impl Configuration {
log_every_n: 1000,
in_memory_embedding_calculation: true,
file_type: FileType::Tsv,
input,
input: vec![input],
output_dir: None,
output_format: OutputFormat::TextFile,
relation_name: String::from("emb"),
Expand Down
28 changes: 24 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,17 @@ fn main() {
.version(crate_version!())
.author(crate_authors!())
.about(crate_description!())
.arg(
Arg::new("inputs")
.multiple_values(true)
.help("Input files paths")
.takes_value(true),
)
.arg(
Arg::new("input")
.short('i')
.long("input")
.required(true)
.help("Input file path")
.help("Deprecated. Use positional args for input files")
.takes_value(true),
)
.arg(
Expand Down Expand Up @@ -127,7 +132,22 @@ fn main() {

info!("Reading args...");

let input = matches.value_of("input").unwrap();
let input: Vec<String> = {
let named_arg = matches.value_of("input");
let position_args = match matches.values_of("inputs") {
None => vec![],
Some(values) => values.into_iter().collect(),
};
position_args
.into_iter()
.chain(named_arg.into_iter())
.map(|s| s.to_string())
.collect()
};
if input.is_empty() {
panic!("Missing input files")
}

let file_type = match matches.value_of("file-type") {
Some(type_name) => match type_name {
"tsv" => configuration::FileType::Tsv,
Expand Down Expand Up @@ -192,7 +212,7 @@ fn main() {
prepend_field: prepend_field_name,
log_every_n: log_every,
in_memory_embedding_calculation,
input: input.to_string(),
input,
file_type,
output_dir,
output_format,
Expand Down
26 changes: 15 additions & 11 deletions src/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,20 @@ pub fn build_graphs(
match &config.file_type {
FileType::Json => {
let mut parser = dom::Parser::default();
read_file(config, |line| {
let row = parse_json_line(line, &mut parser, &config.columns);
entity_processor.process_row(&row);
});
for input in config.input.iter() {
read_file(input, config.log_every_n as u64, |line| {
let row = parse_json_line(line, &mut parser, &config.columns);
entity_processor.process_row(&row);
});
}
}
FileType::Tsv => {
read_file(config, |line| {
let row = parse_tsv_line(line);
entity_processor.process_row(&row);
});
for input in config.input.iter() {
read_file(input, config.log_every_n as u64, |line| {
let row = parse_tsv_line(line);
entity_processor.process_row(&row);
});
}
}
}

Expand All @@ -77,11 +81,11 @@ pub fn build_graphs(
}

/// Read the file line by line, passing every valid line to the handler for parsing.
fn read_file<F>(config: &Configuration, mut line_handler: F)
fn read_file<F>(filepath: &str, log_every: u64, mut line_handler: F)
where
F: FnMut(&str),
{
let input_file = File::open(&config.input).expect("Can't open file");
let input_file = File::open(filepath).expect("Can't open file");
let mut buffered = BufReader::new(input_file);

let mut line_number = 1u64;
Expand All @@ -104,7 +108,7 @@ where
// clear to reuse the buffer
line.clear();

if line_number % config.log_every_n as u64 == 0 {
if line_number % log_every == 0 {
info!("Number of lines processed: {}", line_number);
}

Expand Down
2 changes: 1 addition & 1 deletion tests/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ fn prepare_config() -> Configuration {
prepend_field: false,
log_every_n: 10000,
in_memory_embedding_calculation: true,
input: "files/samples/edgelist_1.tsv".to_string(),
input: vec!["files/samples/edgelist_1.tsv".to_string()],
file_type: FileType::Tsv,
output_format: OutputFormat::TextFile,
output_dir: None,
Expand Down

0 comments on commit 53d83e5

Please sign in to comment.