Skip to content

Commit

Permalink
interlinker: documentation and minor tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
paoloose committed Nov 28, 2023
1 parent 9b73f1c commit b405a53
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 21 deletions.
47 changes: 47 additions & 0 deletions assignments/project3/interlinker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<p align="center">
<h2 align="center">Interlinker</h2>
<p align="center">A real-time web visualizer based on crawling and graph theory</p>
<p align="center">
<img src="./assets/banner.png" width="400" />
</p>
</p>

Running live at <https://www.paoloose.site/discmaths/project/3/>

Part of the [series of Discrete Mathematics projects](https://paoloose.site/discmaths).

What if you could see how the web is created?

It's impossible to map the entire internet, but what if we could generate a small portion of it?

## How does it work?

The concept behind Interlinker is really simple:

- You seed the crawler with a URL ('the origin')
- It will start crawling that page, looking for links (`<a>` tags)
- The links found will be added to a queue
- It will repeat the process for each link in the queue

You can feed multiple origins to the crawler, and it will crawl them in parallel!

## Gallery

The Interlinker's graph seeded with <https://paoloose.site> after:

3 minutes of crawling:

![3 minutes of Interlinker](assets/3min.png)

20 minutes of crawling:

![20 minutes of Interlinker](assets/20min.jpeg)

3 hours of crawling:

![3 hours of Interlinker](assets/3hours.jpeg)

## Acknowledgements

- Graph visualization: <https://github.com/vasturiano/force-graph>
- WebSocket implementation: <https://github.com/seanmonstar/warp>
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assignments/project3/interlinker/assets/3min.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
23 changes: 12 additions & 11 deletions assignments/project3/interlinker/src/ws.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::collections::HashSet;
use std::sync::atomic::{AtomicUsize, Ordering};
use quick_xml::name::QName;
use tokio::sync::mpsc as tokio_mpsc;
use tokio_stream::wrappers::UnboundedReceiverStream;
use warp::filters::ws::{WebSocket, Message};
Expand Down Expand Up @@ -162,20 +163,20 @@ async fn handle_msg(client: &mut Client, msg: Message, _clients: &Clients) -> Re
match reader.read_event_into_async(&mut buf).await {
Ok(Event::Start(e)) => {
println!("- {e:?}");
let tag = e.name().into_inner();
if tag != b"a" { continue; }
let attrs = e.html_attributes()
.find(|attr| attr.as_ref().is_ok_and(|attr| attr.key.into_inner() == b"href"))
.map(|attr| String::from_utf8(attr.unwrap().value.to_vec()).unwrap());

let href = match attrs {
Some(href) => {
if !href.starts_with("http") || !is_valid_url(href.as_str()) {
let tag = e.name();
if tag != QName(b"a") { continue; }
let attr = e.html_attributes()
.find(|attr| attr.as_ref().is_ok_and(|attr| attr.key == QName(b"href")))
.map(|attr| attr.unwrap().value.to_vec());

let href = match attr.map(|attr| String::from_utf8(attr)) {
Some(Ok(href)) => {
if !href.starts_with("http") || !is_valid_url(&href) {
continue;
}
href
},
None => continue,
_ => continue,
};

let domain_to_visit = match extract_domain(&href) {
Expand All @@ -201,7 +202,7 @@ async fn handle_msg(client: &mut Client, msg: Message, _clients: &Clients) -> Re
}
buf.clear();
}
if client.active_origins.read().await.iter().find(|x| **x == origin).is_none() {
if !client.active_origins.read().await.contains(&origin) {
break;
}
}
Expand Down
8 changes: 0 additions & 8 deletions frontend/src/components/Interlinker/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,15 +227,7 @@ ws.addEventListener('message', (m) => {
return;
}
switch (msg.type) {
case 'UrlMessage': {
const { is_invalid } = msg;
if (is_invalid) {
console.log(`invalid input: ${$input.value}`)
}
break;
}
case 'FinishMessage': {
console.log('finish');
const { origin } = msg;
activeOrigins = activeOrigins.filter((o) => o !== origin);
displayActiveOrigins();
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/layouts/Layout.astro
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ const { title } = Astro.props;
width: 8px;
}
*::-webkit-scrollbar-track {
background: #bebebe;
background: #757575;
}
*::-webkit-scrollbar-thumb {
background: #757575;
background: #bebebe;
}
</style>

0 comments on commit b405a53

Please sign in to comment.