Skip to content

Commit

Permalink
feat: add collect
Browse files Browse the repository at this point in the history
  • Loading branch information
baerwang committed Jan 11, 2024
1 parent ab23226 commit 93dd588
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 3 deletions.
86 changes: 86 additions & 0 deletions src/handler/collect.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
use std::collections::HashSet;
use std::sync::Arc;

use headless_chrome::Tab;

/// MIME type prefixes checked against the value of an element's `type` attribute.
///
/// `href` tests each `type` attribute value with `starts_with` against every entry here, so a
/// value carrying extra parameters after the MIME type (for example a charset suffix) still
/// matches its entry.
const CONTENT_TYPE: [&str; 14] = [
"application/x-www-form-urlencoded",
"text/plain",
"text/html",
"application/xml",
"text/xml",
"application/json",
"text/javascript",
"multipart/form-data",
"application/octet-stream",
"text/css",
"image/x-icon",
"image/jpeg",
"image/png",
"image/gif",
];
/// Attribute names whose values `href` collects.
///
/// Each name is turned into a `[name]` attribute selector to find the elements carrying it, and
/// is then compared against the attribute names of every matched element.
const HREF_ATTRIBUTES: [&str; 4] = ["src", "href", "data-url", "data-href"];

pub fn collect(tab: &Arc<Tab>) {
_ = href(tab);
_ = object(tab);
}

/// Gathers the values of link-bearing attributes from the document in `tab` and logs them.
///
/// For every name in `HREF_ATTRIBUTES` the document is queried with the attribute selector
/// `[name]`. On each matched element, the value of that attribute and of any `pluginspage`
/// attribute is added to a de-duplicated set. The set is logged at info level once all
/// selectors have been processed.
///
/// # Errors
///
/// Returns an error if fetching the document or running any of the selector queries fails.
fn href(tab: &Arc<Tab>) -> Result<(), Box<dyn std::error::Error>> {
    let root = tab.get_document()?.node_id;
    let mut links: HashSet<String> = HashSet::new();

    for attr_name in HREF_ATTRIBUTES {
        let selector = format!("[{}]", attr_name);
        let nodes = tab.run_query_selector_all_on_node(root, &selector)?;

        // Elements without an attribute list contribute nothing.
        for attributes in nodes.into_iter().filter_map(|node| node.attributes) {
            // The attribute list is read as alternating `name, value` entries. A trailing name
            // with no value is treated as having an empty value.
            for pair in attributes.chunks(2) {
                let (name, value) = match pair {
                    [name, value] => (name.as_str(), value.as_str()),
                    [name] => (name.as_str(), ""),
                    _ => continue,
                };

                // NOTE(review): this guard has no observable effect, because a `type` attribute
                // can never satisfy the insertion condition below. It is kept to preserve
                // behaviour; confirm whether the intent was to skip the whole element when its
                // `type` is one of `CONTENT_TYPE`.
                if name == "type" && CONTENT_TYPE.iter().any(|t| value.starts_with(t)) {
                    continue;
                }

                if name == "pluginspage" || name == attr_name {
                    // Allocate only for values that are actually kept.
                    links.insert(value.to_string());
                }
            }
        }
    }

    log::info!("{:?}", links);

    Ok(())
}

/// Gathers the `data` attribute values of `<object data=...>` elements in `tab` and logs them.
///
/// The document is queried with the selector `object[data]`. The `data` value of each matched
/// element is added to a de-duplicated set, which is then logged at info level.
///
/// # Errors
///
/// Returns an error if fetching the document or running the selector query fails.
fn object(tab: &Arc<Tab>) -> Result<(), Box<dyn std::error::Error>> {
    let root = tab.get_document()?.node_id;
    let mut links: HashSet<String> = HashSet::new();

    let nodes = tab.run_query_selector_all_on_node(root, "object[data]")?;
    for attributes in nodes.into_iter().filter_map(|node| node.attributes) {
        // The attribute list is read as alternating `name, value` entries. A trailing name
        // with no value counts as an empty value.
        for pair in attributes.chunks(2) {
            match pair {
                [name, value] if name.as_str() == "data" => {
                    links.insert(value.clone());
                }
                [name] if name.as_str() == "data" => {
                    links.insert(String::new());
                }
                _ => {}
            }
        }
    }

    log::info!("{:?}", links);

    Ok(())
}
6 changes: 3 additions & 3 deletions src/handler/crawler.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
use std::sync::Arc;
use std::thread::sleep;
use std::time::Duration;

use headless_chrome::protocol::cdp::types::Event;
use headless_chrome::protocol::cdp::Network::ResourceType;
Expand All @@ -12,6 +10,7 @@ use headless_chrome::protocol::cdp::Runtime::{AddBinding, Evaluate};
use headless_chrome::{Browser, Tab};
use tokio::sync::mpsc;

use crate::handler::collect::collect;
use crate::handler::form::{Html, FORM};
use crate::handler::form_js::{JS_CODE, TAB_INIT};
use crate::{common, model};
Expand Down Expand Up @@ -48,9 +47,9 @@ pub fn tasks(
download_path: None,
})?;
tab.navigate_to(url)?;
tab.wait_until_navigated()?;
let tab_clone = Arc::clone(&tab);
event_listener(&tab, tab_clone, tx)?;
sleep(Duration::from_secs(1));
let result = tab.call_method(evaluate())?;
if let Some(result_value) = result.result.value {
let list: Vec<Html> =
Expand All @@ -63,6 +62,7 @@ pub fn tasks(
}
}
}
collect(&tab);
_ = tab.close(true);

Ok(())
Expand Down
1 change: 1 addition & 0 deletions src/handler/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod collect;
pub mod crawler;
pub mod duplicate;
pub mod form;
Expand Down

0 comments on commit 93dd588

Please sign in to comment.