Skip to content

Commit

Permalink
fix: bulk download work without interrupt
Browse files Browse the repository at this point in the history
  • Loading branch information
amkhrjee committed Aug 31, 2024
1 parent 786468a commit 721053c
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 29 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
/target
/target
*.pem
*.sh
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "wp"
license = "GPLv3"
version = "0.1.8"
version = "0.1.9"
description = "Wikipedia tools on your terminal."
edition = "2021"
authors = ["Aniruddha Mukherjee <amkhrjee@gmail.com>"]
Expand Down
32 changes: 16 additions & 16 deletions src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,9 @@ fn remove_nested_braces(input: &str) -> String {
let mut chars = input.chars().peekable();
while let Some(c) = chars.next() {
if c == '{' && chars.peek() == Some(&'{') {
// Skip the next character since we found {{
chars.next();
stack += 1;
} else if c == '}' && chars.peek() == Some(&'}') {
// Skip the next character since we found }}
chars.next();
if stack > 0 {
stack -= 1;
Expand Down Expand Up @@ -95,22 +93,24 @@ fn parse_text(characters: &Vec<char>) -> Option<Vec<Token>> {
let mut current = 0;
let mut tokens: Vec<Token> = Vec::new();
let mut is_bullet = false;
// const MAX_ITERATIONS = 1000000;
println!("Charatcers: {}", characters.len());
// Very nutty, but will have to do for now
const MAX_ITERATINS: i32 = 150000;
let mut iter_count = 0;

while current < characters.len() {
while current < characters.len() && iter_count < MAX_ITERATINS {
iter_count += 1;
match characters[current] {
'{' => {
// Assuming we can only have three levels of nesting
// This is some convoluted shit thanks to wikipedia's format:(
current += 2;
while advance(characters, &mut current) != '}' {
while advance(characters, &mut current, &mut iter_count) != '}' {
if characters[current] == '{' {
current += 1;
while advance(characters, &mut current) != '}' {
while advance(characters, &mut current, &mut iter_count) != '}' {
if characters[current] == '{' {
current += 1;
while advance(characters, &mut current) != '}' {}
while advance(characters, &mut current, &mut iter_count) != '}' {}
current += 1;
}
}
Expand All @@ -123,7 +123,7 @@ fn parse_text(characters: &Vec<char>) -> Option<Vec<Token>> {
if peek_ahead(&characters, current) == '\'' {
let mut apostrophe_count = 0;
let mut format = FormatType::Bold;
while advance(&characters, &mut current) == '\'' {
while advance(&characters, &mut current, &mut iter_count) == '\'' {
apostrophe_count += 1;
}
if apostrophe_count == 2 {
Expand All @@ -141,7 +141,7 @@ fn parse_text(characters: &Vec<char>) -> Option<Vec<Token>> {
}
start = current - 1;

while advance(&characters, &mut current) != '\'' {}
while advance(&characters, &mut current, &mut iter_count) != '\'' {}
add_token(&mut tokens, start, current, format);
current += apostrophe_count - 1;
} else {
Expand All @@ -156,10 +156,10 @@ fn parse_text(characters: &Vec<char>) -> Option<Vec<Token>> {
let mut has_pipe = false;
current += 2;
start = current;
while advance(characters, &mut current) != ']' {
while advance(characters, &mut current, &mut iter_count) != ']' {
if characters[current] == '[' {
has_nesting = true;
while advance(characters, &mut current) != ']' {}
while advance(characters, &mut current, &mut iter_count) != ']' {}
current += 1;
} else if characters[current] == '|' {
has_pipe = true;
Expand All @@ -177,14 +177,14 @@ fn parse_text(characters: &Vec<char>) -> Option<Vec<Token>> {
add_space(&mut tokens, current);
current += 1;
}
'<' => while advance(characters, &mut current) != '>' {},
'<' => while advance(characters, &mut current, &mut iter_count) != '>' {},
'=' => {
let mut equals_count = 0;
while advance(characters, &mut current) == '=' {
while advance(characters, &mut current, &mut iter_count) == '=' {
equals_count += 1;
}
start = current - 1;
while advance(characters, &mut current) != '=' {}
while advance(characters, &mut current, &mut iter_count) != '=' {}
match equals_count {
2 => {
add_token(&mut tokens, start, current, FormatType::Title);
Expand All @@ -208,7 +208,7 @@ fn parse_text(characters: &Vec<char>) -> Option<Vec<Token>> {
} else if characters[current] == '"' {
current += 1;
start = current;
while advance(characters, &mut current) != '\\' {}
while advance(characters, &mut current, &mut iter_count) != '\\' {}
add_token(&mut tokens, start, current, FormatType::InlineQuote);
}
current += 1;
Expand Down
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ fn main() {
let (plaintext, url_title) = plaintext_from_link(&link);
if args.save {
let mut hasher = DefaultHasher::new();

save_to_disk(&plaintext, &url_title, &mut hasher, false);
} else {
output_to_stdout(&plaintext);
Expand Down
6 changes: 5 additions & 1 deletion src/scraper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ pub fn bulk_download_or_save_links(
let url = Url::parse(start_url)?;
let main_url = url.host_str().ok_or("Invalid URL")?;

println!("💭 Links will be saved to your current directory as zip.");
println!("⚡ Scraping links...");

let mut next_batch_link = start_url.to_string();
Expand Down Expand Up @@ -136,6 +135,10 @@ pub fn bulk_download_or_save_links(
} else {
// Download straight from the links!
println!("⚡ Proceeding with the downloads...");
match fs::create_dir("wp_downloads") {
Ok(()) => println!("Directory created successfully"),
Err(err) => println!("Error creating directory: {}", err),
}
let dir_path = Path::new(".");
let files: Vec<_> = fs::read_dir(dir_path)?
.filter_map(|entry| entry.ok())
Expand All @@ -144,6 +147,7 @@ pub fn bulk_download_or_save_links(
&& entry.path().extension().and_then(|s| s.to_str()) == Some("links")
})
.collect();

for each_file in files {
let file_path = each_file.path();
match download_from_file(file_path.to_str().unwrap()) {
Expand Down
35 changes: 26 additions & 9 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,19 @@ use regex::Regex;

use crate::{plaintext_from_link, FormatType, Token};

pub fn advance(text: &Vec<char>, current: &mut usize) -> char {
*current += 1;
if *current < text.len() {
return text[*current - 1];
pub fn advance(text: &Vec<char>, current: &mut usize, iter: &mut i32) -> char {
let max = 150000;
if *iter < max {
*iter += 1;
*current += 1;
if *current < text.len() {
return text[*current - 1];
}
return '\0';
} else {
panic!("Infinite loop");
// exit(1);
}
return '\0';
}

pub fn add_token(tokens: &mut Vec<Token>, start: usize, current: usize, format: FormatType) {
Expand Down Expand Up @@ -96,7 +103,13 @@ pub fn save_to_disk(
article_title.hash(hasher);
let hash = hasher.finish();
let hash = format!("{:x}.txt", hash);
let path = Path::new(&hash);
let file_path;
if is_bulk {
file_path = format!("./wp_downloads/{}", hash);
} else {
file_path = format!("{}", hash);
}
let path = Path::new(&file_path);

let mut file = match File::create(&path) {
Err(why) => panic!("Error: Couldn't create {}: {}", path.display(), why),
Expand Down Expand Up @@ -138,7 +151,8 @@ pub fn download_from_file(link: &str) -> Option<bool> {
)));

println!("🔍 Total links found: {}", total_count);
println!("🗃️ Downloading articles in bulk...\n");
println!("🗃️ Downloading articles in bulk in wp_downlods...\n");

for link in list_of_links {
let bar = Arc::clone(&bar);
let handle = spawn(move || {
Expand All @@ -149,10 +163,13 @@ pub fn download_from_file(link: &str) -> Option<bool> {
});
handles.push(handle);
}
// Just pure evil
for handle in handles {
handle.join().unwrap();
match handle.join() {
Ok(_) => (),
Err(err) => println!("Thread returned an error: {:?}", err),
}
}

bar.lock().unwrap().finish_and_clear();

println!("\n✅ Download complete.");
Expand Down

0 comments on commit 721053c

Please sign in to comment.