From eeb9a32523fab854ff1a3d3585b16ab4243ad13e Mon Sep 17 00:00:00 2001 From: delorean Date: Fri, 24 May 2024 10:42:52 -0500 Subject: [PATCH] implemented mmh3 favicon hashing --- Cargo.lock | 8 ++++++ Cargo.toml | 4 ++- src/common/conf.rs | 4 +++ src/common/console.rs | 12 +++++++-- src/common/exec.rs | 1 + src/common/modules.rs | 63 ++++++++++++++++++++++++++++++++++++++++++- src/common/net.rs | 10 ++++++- 7 files changed, 97 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3e4cb00..68d2749 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -730,6 +730,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "native-tls" version = "0.2.11" @@ -1225,10 +1231,12 @@ dependencies = [ name = "speedboat" version = "0.1.0" dependencies = [ + "base64", "chrono", "clap", "colored", "futures", + "murmur3", "reqwest", "select", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 0a73875..f2c229b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,10 +6,12 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +base64 = "0.22.1" chrono = "0.4.38" clap = { version = "4.5.4", features = ["derive"] } colored = "2.1.0" futures = "0.3.30" +murmur3 = "0.5.2" reqwest = "0.12.4" select = "0.6.0" -tokio = { version = "1", features = ["full"] } \ No newline at end of file +tokio = { version = "1", features = ["full"] } diff --git a/src/common/conf.rs b/src/common/conf.rs index 622b96b..7a0225c 100644 --- a/src/common/conf.rs +++ b/src/common/conf.rs @@ -42,6 +42,10 @@ pub struct Config { /// read n bytes of the response document body pub bodysize: usize, + #[clap(long = "favicon")] + /// computes an mmh3 favicon hash + pub favicon: bool, + #[clap(long = "ts")] /// include timestamps of requests 
pub timestamps: bool, diff --git a/src/common/console.rs b/src/common/console.rs index 4ca4f9e..402a46d 100644 --- a/src/common/console.rs +++ b/src/common/console.rs @@ -34,7 +34,7 @@ pub fn fmtcode(code: u16) -> ColoredString { pub fn tstamp() -> String { let date = Local::now(); - let datestr = format!("{}", date.format("[%Y-%m-%d][%H:%M:%S}")); + let datestr = format!("{}", date.format("[%Y-%m-%d][%H:%M:%S]")); format!("{}", datestr.bright_blue()) } @@ -56,7 +56,15 @@ pub fn parsebody(s: String) -> String { ) } -pub fn parsehit(sc: u16, url: String) -> String { +pub fn parsehash(h: i32) -> String { + format!("{}{}{}", + "favicon[".bright_black().bold(), + h.to_string().blue().bold(), + "]".bright_black().bold() + ) +} + +pub fn parsehit(sc: u16, url: &String) -> String { format!( "{} {} {}", fmtcode(sc), diff --git a/src/common/exec.rs b/src/common/exec.rs index 9b08d98..d1ff556 100644 --- a/src/common/exec.rs +++ b/src/common/exec.rs @@ -31,6 +31,7 @@ pub async fn takeoff(args: Config, params: Params) { scodes, params.exclude, args.pulltitles, + args.favicon, args.timestamps, args.bodysize ) diff --git a/src/common/modules.rs b/src/common/modules.rs index 1496c4a..86c19ff 100644 --- a/src/common/modules.rs +++ b/src/common/modules.rs @@ -1,6 +1,11 @@ use select::{document::Document, predicate::Name}; +use reqwest::Url; +use base64::{Engine as _, engine::general_purpose}; +use murmur3::murmur3_32; +use std::{error::Error, +io::Cursor}; -use super::console::{parsetitle, parsebody, fmtwhitespace}; +use super::console::{parsetitle, parsebody, parsehash, fmtwhitespace}; pub fn get_title(body: &String) -> String { let document = Document::from(body.as_str()); @@ -29,4 +34,60 @@ pub fn read_body(body: &String, lim: usize) -> String { } parsebody(bodytext) +} + +fn fmtbase64(s: &str, interval: usize, sep: char) -> String { + let mut out = String::with_capacity(s.len() + s.len() / interval); + let mut count = 0; + + for (_, c) in s.chars().enumerate() { + out.push(c); 
+ count += 1; + if count == interval { + out.push(sep); + count = 0; + } + } + if count != 0 { + out.push(sep); + } + + out +} + +fn faviconurl(doc: Document, url: String ) -> Result<String, Box<dyn Error>> { + for node in doc.find(Name("link")) { + if let Some(rel) = node.attr("rel") { + if rel.eq("icon") { + if let Some(href) = node.attr("href") { + let base_url = Url::parse(&url)?; + let favicon_url = base_url.join(href)?; + return Ok(favicon_url.to_string()); + } + } + } + } + + Err("".into()) +} + +async fn dl_favicon(url: String) -> Result<Vec<u8>, Box<dyn Error>> { + let data = reqwest::get(url).await?.bytes().await?.to_vec(); + Ok(data) +} + +pub async fn hash_favicon(body: &String, url: String) -> String { + let document = Document::from(body.as_str()); + + if let Ok(favurl) = faviconurl(document, url) { + if let Ok(data) = dl_favicon(favurl).await { + // compute hash + let b64 = general_purpose::STANDARD.encode(data); + let f_b64 = fmtbase64(&b64, 76,'\n'); + let hash = murmur3_32(&mut Cursor::new(f_b64.into_bytes()), 0).unwrap_or_else(|_| 0) as i32; + return parsehash(hash); + } + } + + "".into() } \ No newline at end of file diff --git a/src/common/net.rs b/src/common/net.rs index 0949a88..7fca2c6 100644 --- a/src/common/net.rs +++ b/src/common/net.rs @@ -30,6 +30,7 @@ pub async fn query( codes: Vec, exclude: bool, titles: bool, + favicon: bool, timestamps: bool, bodysize: usize, ) -> Result<(), reqwest::Error> { @@ -56,7 +57,7 @@ pub async fn query( let url: String = response.url().to_string(); let body = response.text().await?; - let mut out = parsehit(sc, url); + let mut out = parsehit(sc, &url); if timestamps { out = format!("{} {}", tstamp(), out); @@ -66,6 +67,13 @@ pub async fn query( out = format!("{} {}", out, get_title(&body)); } + if favicon { + let hash = hash_favicon(&body, url).await; + if !hash.is_empty() { + out = format!("{} {}", out, hash); + } + } + if bodysize > 0 { out = format!("{} {}", out, read_body(&body, bodysize)); }