Merge pull request #7 from Rayahhhmed/SC-2-clean-scraper-code
Sc 2 clean scraper code
Rayahhhmed authored Jan 6, 2024
2 parents d61ed1f + b36bc87 commit 7724177
Showing 7 changed files with 200 additions and 105 deletions.
43 changes: 43 additions & 0 deletions src/class_scraper.rs
@@ -0,0 +1,43 @@
// pub async fn run_scraper(&mut self) -> Result<(), Box<dyn std::error::Error>> {
//     match &self.url {
//         Some(url) => {
//             let html = self.fetch_url(url).await?;
//             println!("{}", html);
//             let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
//             let code_selector = Selector::parse("td.data").unwrap();
//             let name_selector = Selector::parse("td.data a").unwrap();
//             let link_selector = Selector::parse("td.data a").unwrap();
//             let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
//             let document = scraper::Html::parse_document(&html);
//             for row_node in document.select(&row_selector) {
//                 // Extract data from each row
//                 let subject_area_course_code =
//                     extract_text(row_node.select(&code_selector).next().unwrap());
//                 let subject_area_course_name =
//                     extract_text(row_node.select(&name_selector).next().unwrap());
//                 let url = get_html_link_to_page(
//                     row_node
//                         .select(&link_selector)
//                         .next()
//                         .map_or("", |node| node.value().attr("href").unwrap_or("")),
//                 );
//                 let school = extract_text(row_node.select(&school_selector).next().unwrap());
//                 // Create a Page struct and push it to the vector
//                 let page = Page {
//                     subject_area_course_code,
//                     subject_area_course_name,
//                     url,
//                     school,
//                     courses: Vec::new(),
//                 };

//                 self.add_page(page);

//             }

//             println!("{:?}", self.pages);
//             Ok(())
//         }
//         None => Err(Box::new(UrlInvalidError)),
//     }
// }
5 changes: 5 additions & 0 deletions src/lib.rs
@@ -1,5 +1,10 @@
mod scraper;
mod url_invalid_error;

mod class_scraper;
mod subject_area_scraper;
mod text_manipulators;

pub use scraper::Scraper;
pub use url_invalid_error::UrlInvalidError;
pub use subject_area_scraper::SubjectAreaScraper;
11 changes: 5 additions & 6 deletions src/main.rs
@@ -1,12 +1,11 @@
use spooderman::Scraper;
use spooderman::{Scraper, SubjectAreaScraper};

#[tokio::main]
async fn main() {
    let mut scraper =
        Scraper::new().set_url("https://timetable.unsw.edu.au/2024/subjectSearch.html".to_string());

    match scraper.run_scraper().await {
        Ok(_res) => {
    let mut scraper = SubjectAreaScraper::new()
        .set_url("https://timetable.unsw.edu.au/2024/subjectSearch.html".to_string());
    match scraper.run_scraper_on_url().await {
        Ok(_) => {
            println!("Scraping successful!\n");
        }
        Err(e) => eprintln!("Error: {}", e),
File renamed without changes.
135 changes: 36 additions & 99 deletions src/scraper.rs
@@ -3,7 +3,7 @@ use reqwest::ClientBuilder
use scraper::{html, ElementRef, Selector};
use std::ops::Add;

use crate::UrlInvalidError;
use crate::{UrlInvalidError, subject_area_scraper::SubjectAreaPage};

#[derive(Debug)]
enum Term {
@@ -107,110 +107,47 @@ pub struct Course {
    notes: String,
}

#[derive(Debug)]
pub struct Page {
    url: String,
    subject_area_course_code: String,
    subject_area_course_name: String,
    school: String,
    courses: Vec<Course>,

pub trait Page {
    fn view_page_details(&self);
}
pub trait Scraper {
    fn new() -> Self;
    fn set_url(&mut self, url: String) -> Self;
    fn add_page(&mut self, page: Box<dyn Page>);
}

#[derive(Debug)]
pub struct Scraper {
    url: Option<String>,
    pages: Option<Vec<Page>>,
pub async fn fetch_url(url: &str) -> Result<String, Box<dyn std::error::Error>> {
    let client = ClientBuilder::new()
        .danger_accept_invalid_certs(true)
        .build()?;
    let response = client.get(url).send().await?;
    let body = response.text().await?;
    Ok(body)
}
// impl Scraper {


impl Scraper {
    pub fn new() -> Self {
        Scraper {
            url: None,
            pages: Some(Vec::new()),
        }
    }


    pub fn set_url(mut self, url: String) -> Self {
        self.url = Some(url);
        self
    }
//     pub fn add_page(&mut self, page: impl Page) {
//         self.pages.push(Box::new(page));
//     }

    pub fn add_page(&mut self, page: Page) {
        let mut new_pages = self.pages.take().unwrap_or_default();
        new_pages.push(page);
        self.pages = Some(new_pages);
    }
//     // pub async fn run_scraper(&mut self) -> Result<(), Box<dyn std::error::Error>> {
//     //     self.subject_area_scrape().await
//     // }
// }

    async fn fetch_url(&self, url: &str) -> Result<String, Box<dyn std::error::Error>> {
        let client = ClientBuilder::new()
            .danger_accept_invalid_certs(true)
            .build()?;
        let response = client.get(url).send().await?;
        let body = response.text().await?;
        Ok(body)
    }
// impl Scraper {
//     pub fn view_scraper(&self) {
//         println!("{:?}", self);
//     }
// }

    pub async fn run_scraper(&mut self) -> Result<(), Box<dyn std::error::Error>> {
        match &self.url {
            Some(url) => {
                let html = self.fetch_url(url).await?;
                println!("{}", html);
                let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
                let code_selector = Selector::parse("td.data").unwrap();
                let name_selector = Selector::parse("td.data a").unwrap();
                let link_selector = Selector::parse("td.data a").unwrap();
                let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
                let document = scraper::Html::parse_document(&html);
                for row_node in document.select(&row_selector) {
                    // Extract data from each row
                    let subject_area_course_code =
                        extract_text(row_node.select(&code_selector).next().unwrap());
                    let subject_area_course_name =
                        extract_text(row_node.select(&name_selector).next().unwrap());
                    let url = get_html_link_to_page(
                        row_node
                            .select(&link_selector)
                            .next()
                            .map_or("", |node| node.value().attr("href").unwrap_or("")),
                    );
                    let school = extract_text(row_node.select(&school_selector).next().unwrap());
                    // Create a Page struct and push it to the vector
                    let page = Page {
                        subject_area_course_code,
                        subject_area_course_name,
                        url,
                        school,
                        courses: Vec::new(),
                    };

                    self.add_page(page);
                }

                println!("{:?}", self.pages);
                Ok(())
            }
            None => Err(Box::new(UrlInvalidError)),
        }
    }
}

impl Scraper {
    pub fn view_scraper(&self) {
        println!("{:?}", self);
    }
}

impl Default for Scraper {
    fn default() -> Self {
        Self::new()
    }
}
// impl Default for Scraper {
//     fn default() -> Self {
//         Self::new()
//     }
// }

fn extract_text(node: ElementRef) -> String {
    node.text().collect::<String>()
}

fn get_html_link_to_page(html_fragment: &str) -> String {
    "https://timetable.unsw.edu.au/2024/".to_string() + html_fragment
}
102 changes: 102 additions & 0 deletions src/subject_area_scraper.rs
@@ -0,0 +1,102 @@
use scraper::Selector;

use crate::{
    scraper::{Course, Page, fetch_url},
    text_manipulators::{extract_text, get_html_link_to_page},
    Scraper, UrlInvalidError,
};

#[derive(Debug)]
pub struct SubjectAreaPage {
    url: String,
    subject_area_course_code: String,
    subject_area_course_name: String,
    school: String,
    courses: Vec<Course>,
}

impl Page for SubjectAreaPage {
    fn view_page_details(&self) {
        println!("{:?}", self)
    }
}

#[derive(Debug)]
pub struct SubjectAreaScraper {
    pub url: Option<String>,
    pub pages: Vec<Box<dyn Page>>,
}

impl std::fmt::Debug for dyn Page {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}", self.view_page_details())
    }
}

impl SubjectAreaScraper {
    pub async fn run_scraper_on_url(&mut self) -> Result<(), Box<dyn std::error::Error>> {
        match &self.url {
            Some(url) => {
                let html = fetch_url(url).await?;
                println!("{}", html);
                let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
                let code_selector = Selector::parse("td.data").unwrap();
                let name_selector = Selector::parse("td.data a").unwrap();
                let link_selector = Selector::parse("td.data a").unwrap();
                let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
                let document = scraper::Html::parse_document(&html);
                for row_node in document.select(&row_selector) {
                    // Extract data from each row
                    let subject_area_course_code =
                        extract_text(row_node.select(&code_selector).next().unwrap());
                    let subject_area_course_name =
                        extract_text(row_node.select(&name_selector).next().unwrap());
                    let url = get_html_link_to_page(
                        row_node
                            .select(&link_selector)
                            .next()
                            .map_or("", |node| node.value().attr("href").unwrap_or("")),
                    );
                    let school = extract_text(row_node.select(&school_selector).next().unwrap());
                    // Create a SubjectAreaPage struct and push it to the vector
                    let page = SubjectAreaPage {
                        subject_area_course_code,
                        subject_area_course_name,
                        url,
                        school,
                        courses: Vec::new(),
                    };

                    self.add_page(Box::new(page));
                }

                println!("{:?}", self.pages);
                Ok(())
            }
            None => Err(Box::new(UrlInvalidError)),
        }
    }
}

impl Scraper for SubjectAreaScraper {
    fn new() -> Self {
        SubjectAreaScraper {
            url: None,
            pages: Vec::new(),
        }
    }

    fn set_url(&mut self, url: String) -> Self {
        SubjectAreaScraper {
            url: Some(url),
            pages: Vec::new(),
        }
    }

    fn add_page(&mut self, page: Box<dyn Page>) {
        self.pages.push(page);
    }
}
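
A self-contained sketch of the same selector logic, run against a hard-coded HTML fragment instead of a live fetch. The fragment and its href are illustrative, not taken from the timetable site; the selector strings are the ones the scraper above uses.

use scraper::{Html, Selector};

fn main() {
    // Illustrative fragment mimicking one row of the timetable table.
    let html = r#"
        <table>
          <tr class="rowLowlight">
            <td class="data"><a href="COMPKENS.html">COMP</a></td>
            <td class="data"><a href="COMPKENS.html">Computer Science</a></td>
            <td class="data">School of CSE</td>
          </tr>
        </table>"#;
    let document = Html::parse_document(html);
    let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
    let code_selector = Selector::parse("td.data").unwrap();
    let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
    for row in document.select(&row_selector) {
        // `td.data` matches several cells; `.next()` takes the first (the code).
        let code: String = row.select(&code_selector).next().unwrap().text().collect();
        let school: String = row.select(&school_selector).next().unwrap().text().collect();
        println!("code: {}, school: {}", code.trim(), school.trim());
    }
}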
9 changes: 9 additions & 0 deletions src/text_manipulators.rs
@@ -0,0 +1,9 @@
use scraper::ElementRef;

pub fn extract_text(node: ElementRef) -> String {
    node.text().collect::<String>()
}

pub fn get_html_link_to_page(html_fragment: &str) -> String {
    "https://timetable.unsw.edu.au/2024/".to_string() + html_fragment
}
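
Both helpers are pure functions, so their behaviour is easy to pin down with a unit test. A sketch, assuming a test module in the same file; the "COMPKENS.html" fragment is made up for illustration.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn link_is_fragment_joined_onto_base_url() {
        // The base URL is hard-coded in get_html_link_to_page above.
        assert_eq!(
            get_html_link_to_page("COMPKENS.html"),
            "https://timetable.unsw.edu.au/2024/COMPKENS.html"
        );
    }
}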
