Commit

cleaned code

Rayahhhmed committed Jan 8, 2024
1 parent b36bc87 commit a9ba7cd
Showing 6 changed files with 286 additions and 105 deletions.
159 changes: 116 additions & 43 deletions src/class_scraper.rs
@@ -1,43 +1,116 @@
// pub async fn run_scraper(&mut self) -> Result<(), Box<dyn std::error::Error>> {
// match &self.url {
// Some(url) => {
// let html = self.fetch_url(url).await?;
// println!("{}", html);
// let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
// let code_selector = Selector::parse("td.data").unwrap();
// let name_selector = Selector::parse("td.data a").unwrap();
// let link_selector = Selector::parse("td.data a").unwrap();
// let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
// let document = scraper::Html::parse_document(&html);
// for row_node in document.select(&row_selector) {
// // Extract data from each row
// let subject_area_course_code =
// extract_text(row_node.select(&code_selector).next().unwrap());
// let subject_area_course_name =
// extract_text(row_node.select(&name_selector).next().unwrap());
// let url = get_html_link_to_page(
// row_node
// .select(&link_selector)
// .next()
// .map_or("", |node| node.value().attr("href").unwrap_or("")),
// );
// let school = extract_text(row_node.select(&school_selector).next().unwrap());
// // Create a Course struct and push it to the vector
// let page = Page {
// subject_area_course_code,
// subject_area_course_name,
// url,
// school,
// courses: Vec::new(),
// };

// self.add_page(page);

// }

// println!("{:?}", self.pages);
// Ok(())
// }
// None => Err(Box::new(UrlInvalidError)),
// }
// }
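// NOTE: the commented block above is the pre-refactor scraper, kept for reference.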
use scraper::Selector;

use crate::{
scraper::{fetch_url, Course, Page, Term, Status, Enrolment, DateBlock, ClassTimeBlock},
text_manipulators::{extract_text, get_html_link_to_page},
Scraper, UrlInvalidError,
};

#[derive(Debug)]
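/// One scraped timetable page: its URL, subject-area code and name, school, and courses.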
pub struct ClassPage {
url: String,
subject_area_course_code: String,
subject_area_course_name: String,
school: String,
courses: Vec<Course>,
}

#[derive(Debug)]
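/// A single class offering: section, term, activity, status, enrolment, dates, mode and meeting times.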
pub struct Class {
class_id: u32,
section: String,
term: Term,
activity: String,
status: Status,
course_enrolment: Enrolment,
term_date: DateBlock,
mode: String,
times: Vec<ClassTimeBlock>,
}

impl Page for ClassPage {
fn view_page_details(&self) {
println!("{:?}", self)
}
}

#[derive(Debug)]
pub struct SubjectAreaScraper {
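/// Scrapes the subject-area search page and accumulates the pages it links to.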
pub url: Option<String>,
pub pages: Vec<Box<dyn Page>>,
}

impl std::fmt::Debug for dyn Page {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // view_page_details() prints the page itself and returns (), so write a
        // placeholder here rather than echoing that unit value.
        self.view_page_details();
        write!(f, "Page")
    }
}

impl SubjectAreaScraper {
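    /// Fetches `self.url` and prepares the selectors; the row-extraction logic is
    /// still commented out below, so for now this only prints `self.pages`.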
pub async fn run_scraper_on_url(&mut self) -> Result<(), Box<dyn std::error::Error>> {
match &self.url {
Some(url) => {
let html = fetch_url(url).await?;
// println!("{}", html);
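                // Selectors for the timetable markup; unused until the extraction loop below is restored.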
let form_body_selector = Selector::parse("td.formBody tbody tr td.formBody").unwrap();
let code_selector = Selector::parse("tr.label").unwrap();
let name_selector = Selector::parse("td.data a").unwrap();
let link_selector = Selector::parse("td.data a").unwrap();
let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
let document = scraper::Html::parse_document(&html);
// for row_node in document.select(&row_selector) {
// // Extract data from each row
// let subject_area_course_code =
// extract_text(row_node.select(&code_selector).next().unwrap());
// let subject_area_course_name =
// extract_text(row_node.select(&name_selector).next().unwrap());
// let url = get_html_link_to_page(
// row_node
// .select(&link_selector)
// .next()
// .map_or("", |node| node.value().attr("href").unwrap_or("")),
// );
// let school = extract_text(row_node.select(&school_selector).next().unwrap());
// // Create a Course struct and push it to the vector
// let page = SubjectAreaPage {
// subject_area_course_code,
// subject_area_course_name,
// url,
// school,
// courses: Vec::new(),
// };

// self.add_page(Box::new(page));
// }

println!("{:?}", self.pages);
Ok(())
}
None => Err(Box::new(UrlInvalidError)),
}
}
}
impl Scraper for SubjectAreaScraper {
fn new() -> Self {
SubjectAreaScraper {
url: None,
pages: Vec::new(),
}
}

fn set_url(&mut self, url: String) -> Self {
SubjectAreaScraper {
url: Some(url),
pages: Vec::new(),
}
}

fn add_page(&mut self, page: Box<dyn Page>) {
self.pages.push(page);
}
}
150 changes: 150 additions & 0 deletions src/course_scraper.rs
@@ -0,0 +1,150 @@
use scraper::Selector;

use crate::{
scraper::{Course, Page, fetch_url},
text_manipulators::{extract_text, get_html_link_to_page},
Scraper, UrlInvalidError,
};

#[derive(Debug)]
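/// One scraped course-area page: its URL, subject-area code and name, school, and courses.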
pub struct CourseAreaPage {
url: String,
subject_area_course_code: String,
subject_area_course_name: String,
school: String,
courses: Vec<Course>,
}

impl Page for CourseAreaPage {
fn view_page_details(&self) {
println!("{:?}", self)
}
}

#[derive(Debug)]
pub struct CourseScraper {
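/// Scrapes course listing pages and accumulates the parsed pages.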
pub url: Option<String>,
pub pages: Vec<Box<dyn Page>>,
}

// `Debug` for `dyn Page` is implemented once in class_scraper.rs; a second
// blanket impl here would conflict (error E0119), so it is not repeated.

impl CourseScraper {
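    /// Fetches `self.url`, walks each course row in the listing table, and builds a
    /// `CourseAreaPage` from the code, name, link and school cells.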
pub async fn run_scraper_on_url(&mut self) -> Result<(), Box<dyn std::error::Error>> {
match &self.url {
Some(url) => {
let html = fetch_url(url).await?;
println!("{}", html);
let row_selector = Selector::parse("tr.rowLowlight, tr.rowHighlight").unwrap();
let code_selector = Selector::parse("td.data").unwrap();
let name_selector = Selector::parse("td.data a").unwrap();
let link_selector = Selector::parse("td.data a").unwrap();
let school_selector = Selector::parse("td.data:nth-child(3)").unwrap();
let document = scraper::Html::parse_document(&html);
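                // Each course is one highlighted/lowlighted <tr>; pull its cells via the selectors above.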
for row_node in document.select(&row_selector) {
// Extract data from each row
let subject_area_course_code =
extract_text(row_node.select(&code_selector).next().unwrap());
let subject_area_course_name =
extract_text(row_node.select(&name_selector).next().unwrap());
let url = get_html_link_to_page(
row_node
.select(&link_selector)
.next()
.map_or("", |node| node.value().attr("href").unwrap_or("")),
);
let school = extract_text(row_node.select(&school_selector).next().unwrap());
                    // Create a CourseAreaPage and push it onto the scraper's pages
                    let page = CourseAreaPage {
subject_area_course_code,
subject_area_course_name,
url,
school,
courses: Vec::new(),
};

self.add_page(Box::new(page));
}

println!("{:?}", self.pages);
Ok(())
}
None => Err(Box::new(UrlInvalidError)),
}
}

    pub async fn run_course_scraper(&mut self) -> Result<(), Box<dyn std::error::Error>> {
        match &self.url {
Some(url) => {
let html = fetch_url(url).await?;
println!("{}", html);

let row_selector = Selector::parse("tr.rowLowlight td.data").unwrap();
let code_selector = Selector::parse("td.data").unwrap();
let name_selector = Selector::parse("td.data a").unwrap();
let link_selector = Selector::parse("td.data a").unwrap();
let uoc_selector = Selector::parse("td.data:nth-child(3)").unwrap();
let document = scraper::Html::parse_document(&html);
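                // Row extraction here is still commented out; see run_scraper_on_url above for the working version.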
for row_node in document.select(&row_selector) {
// // Extract data from each row
// let subject_area_course_code =
// extract_text(row_node.select(&code_selector).next().unwrap());
// let subject_area_course_name =
// extract_text(row_node.select(&name_selector).next().unwrap());
// let url = get_html_link_to_page(
// row_node
// .select(&link_selector)
// .next()
// .map_or("", |node| node.value().attr("href").unwrap_or("")),
// );
// let school = extract_text(row_node.select(&school_selector).next().unwrap());
// // Create a Course struct and push it to the vector
// let page = SubjectAreaPage {
// subject_area_course_code,
// subject_area_course_name,
// url,
// school,
// courses: Vec::new(),
// };

                    // self.add_page(Box::new(page)); // disabled: `page` is only built in the commented block above
}

println!("{:?}", self.pages);
Ok(())
}
None => Err(Box::new(UrlInvalidError)),
}
}
}


impl Scraper for CourseScraper {
    fn new() -> Self {
        CourseScraper {
            url: None,
            pages: Vec::new(),
        }
    }

    fn set_url(&mut self, url: String) -> Self {
        CourseScraper {
            url: Some(url),
            pages: Vec::new(),
        }
    }

    fn add_page(&mut self, page: Box<dyn Page>) {
        self.pages.push(page);
    }
}
2 changes: 1 addition & 1 deletion src/lib.rs
@@ -6,5 +6,5 @@ mod subject_area_scraper;
mod text_manipulators;

pub use scraper::Scraper;
pub use url_invalid_error::UrlInvalidError;
pub use subject_area_scraper::SubjectAreaScraper;
2 changes: 1 addition & 1 deletion src/main.rs
@@ -2,7 +2,7 @@ use spooderman::{Scraper, SubjectAreaScraper};

#[tokio::main]
async fn main() {
    let mut scraper = SubjectAreaScraper::new()
.set_url("https://timetable.unsw.edu.au/2024/subjectSearch.html".to_string());
match scraper.run_scraper_on_url().await {
Ok(_) => {