Skip to content

Commit

Permalink
feat: data collected by production
Browse files Browse the repository at this point in the history
  • Loading branch information
baerwang committed Jan 15, 2024
1 parent a1062df commit 7b3311c
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 26 deletions.
8 changes: 6 additions & 2 deletions src/channel/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::collections::HashSet;

use headless_chrome::Browser;
use tokio::sync::mpsc::Sender;

Expand All @@ -7,6 +9,7 @@ pub struct GlobalState {
pub domain: String,
pub browser: Browser,
pub config: TaskConfig,
pub store: HashSet<String>,

pub sender: Option<Sender<String>>,
}
Expand All @@ -17,13 +20,14 @@ impl GlobalState {
domain,
browser,
config,
store: HashSet::new(),
sender: Some(tx),
}
}

pub fn send_message(&self, message: &str) {
pub async fn send_message(&self, message: &str) {
if let Some(ref sender) = self.sender {
if sender.blocking_send(message.to_owned()).is_err() {
if sender.send(message.to_owned()).await.is_err() {
log::error!("Failed to send URL through channel");
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/cli/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ pub struct CLi {
/// Authenticate password
#[arg(short, long)]
pub password: Option<String>,
/// Log level filter: DEBUG, INFO, WARN or ERROR (defaults to INFO)
#[arg(short, long, default_value = "INFO", action = Set)]
pub log_level: String,
#[command(subcommand)]
pub opt: Option<Opt>,
}
Expand Down
8 changes: 3 additions & 5 deletions src/cli/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ pub async fn cli() -> Result<(), Box<dyn std::error::Error>> {
repeat: duplicate_factory,
};

env_logger::init_from_env(env_logger::Env::new().default_filter_or("INFO"));
env_logger::init_from_env(env_logger::Env::new().default_filter_or(app.log_level));

common::load("user_agent", "files/user_agent.toml");
common::load("form", "files/form.toml");
Expand Down Expand Up @@ -115,17 +115,15 @@ pub async fn cli() -> Result<(), Box<dyn std::error::Error>> {

let set: DashSet<String> = DashSet::new();
let browser = Browser::new(launch_options)?;
let state = channel::GlobalState::new(
let mut state = channel::GlobalState::new(
tx.clone(),
Arc::new(app.target[0].clone()).clone().to_string(),
browser,
config,
);
while let Some(url) = rx.recv().await {
if set.insert(url.clone()) {
_ = crawler::tasks(url.clone().as_str(), tx.clone(), &state);
} else {
println!("Value {} already exists", url.clone());
_ = crawler::tasks(url.clone().as_str(), tx.clone(), &mut state).await;
}
}

Expand Down
29 changes: 15 additions & 14 deletions src/handler/collect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,28 @@ const JS_OBJECT: &str = r#"
list
"#;

pub fn collect(state: &channel::GlobalState, tab: &Arc<Tab>) {
_ = query_selector_all(state, tab, JS_HREF);
_ = query_selector_all(state, tab, JS_OBJECT);
pub async fn collect(state: &mut channel::GlobalState, tab: &Arc<Tab>) {
_ = query_selector_all(state, tab, JS_HREF).await;
_ = query_selector_all(state, tab, JS_OBJECT).await;
}

fn query_selector_all(
state: &channel::GlobalState,
async fn query_selector_all(
state: &mut channel::GlobalState,
tab: &Arc<Tab>,
v: &str,
) -> Result<HashSet<String>, Box<dyn Error>> {
) -> Result<(), Box<dyn Error>> {
let result = tab.call_method(util::evaluate(v))?;
if let Some(result_value) = result.result.value {
return Ok(
serde_json::from_str::<HashSet<String>>(&result_value.to_string())?
.into_iter()
.filter(|s| matching_filter(s))
.map(|v| parse_url(state.domain.to_string(), v))
.collect(),
);
let set = serde_json::from_str::<HashSet<String>>(&result_value.to_string())?;
for s in &set {
if matching_filter(s) && state.store.insert(s.clone()) {
state
.send_message(parse_url(state.domain.to_string(), s.to_string()).as_str())
.await
}
}
}
Ok(HashSet::new())
Ok(())
}

fn parse_url(root: String, child: String) -> String {
Expand Down
10 changes: 5 additions & 5 deletions src/handler/crawler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ use crate::handler::form::{Html, FORM};
use crate::handler::form_js::{JS_CODE, TAB_INIT};
use crate::{channel, common};

pub fn tasks(
pub async fn tasks(
url: &str,
tx: mpsc::Sender<String>,
state: &channel::GlobalState,
state: &mut channel::GlobalState,
) -> Result<(), Box<dyn std::error::Error>> {
let random_ug = common::user_agent::random_user_agent();
let tab = state.browser.new_tab()?;
Expand Down Expand Up @@ -50,7 +50,8 @@ pub fn tasks(
tab.navigate_to(url)?;
tab.wait_until_navigated()?;
let tab_clone = Arc::clone(&tab);
event_listener(&tab, tab_clone, tx)?;
event_listener(&tab, tab_clone, tx).await?;
collect(state, &tab).await;
let result = tab.call_method(util::evaluate(JS_CODE))?;
if let Some(result_value) = result.result.value {
let list: Vec<Html> =
Expand All @@ -63,13 +64,12 @@ pub fn tasks(
}
}
}
collect(state, &tab);
_ = tab.close(true);

Ok(())
}

fn event_listener(
async fn event_listener(
tab: &Arc<Tab>,
tab_clone: Arc<Tab>,
tx: mpsc::Sender<String>,
Expand Down

0 comments on commit 7b3311c

Please sign in to comment.