Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ray prefix subject area remove #13

Merged
merged 4 commits into from
Sep 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26,510 changes: 14,020 additions & 12,490 deletions classes.json

Large diffs are not rendered by default.

60,980 changes: 42,427 additions & 18,553 deletions courses.json

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Build image for the UNSW timetable scraper: compiles the Rust project and
# runs the scrape + batch-insert job against Hasuragres at image-build time.
FROM rust:1.80
WORKDIR /app
COPY . .

# Hasuragres connection settings, supplied via `docker build --build-arg`.
# ARG values are visible to RUN steps below, but NOTE(review): they are also
# recorded in the image history — confirm whether a runtime secret mechanism
# should be used instead.
ARG HASURAGRES_URL
ARG HASURAGRES_API_KEY

# Base URL of the UNSW timetable site; presumably the scraper appends the
# current year to this path — confirm against the scraper's URL handling.
ENV TIMETABLE_API_URL=https://timetable.unsw.edu.au/year/

# Fix: `--release` is a cargo flag and must appear before `--`. The original
# `cargo r -- scrape_n_batch_insert -release` passed `-release` to the binary
# as a stray argument and built/ran an unoptimized debug binary.
RUN cargo run --release -- scrape_n_batch_insert

2 changes: 1 addition & 1 deletion sql/Classes/up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ CREATE TABLE Classes (
"consent" VARCHAR(255) NOT NULL,
"mode" VARCHAR(255) NOT NULL,
"class_notes" TEXT,
FOREIGN KEY ("course_id") REFERENCES Courses("subject_area_course_code") ON DELETE CASCADE
FOREIGN KEY ("course_id") REFERENCES Courses("course_code") ON DELETE CASCADE
);
4 changes: 2 additions & 2 deletions sql/Courses/up.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
CREATE TABLE Courses (
"subject_area_course_code" VARCHAR(8) PRIMARY KEY, --id
"subject_area_course_name" VARCHAR(255) NOT NULL,
"course_code" VARCHAR(8) PRIMARY KEY, --id
"course_name" VARCHAR(255) NOT NULL,
"uoc" INT NOT NULL,
"faculty" VARCHAR(255),
"school" VARCHAR(255),
Expand Down
2 changes: 1 addition & 1 deletion sql/Times/up.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ CREATE TABLE Times (
"time" VARCHAR(100) NOT NULL,
"weeks" VARCHAR(100) NOT NULL,
FOREIGN KEY ("class_id") REFERENCES Classes("class_id") ON DELETE CASCADE,
FOREIGN KEY ("course_id") REFERENCES Courses("subject_area_course_code") ON DELETE CASCADE
FOREIGN KEY ("course_id") REFERENCES Courses("course_code") ON DELETE CASCADE
);
16 changes: 8 additions & 8 deletions src/class_scraper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use crate::{

#[derive(Debug)]
pub struct Course {
pub subject_area_course_code: String,
pub subject_area_course_name: String,
pub course_code: String,
pub course_name: String,
pub uoc: i32,
pub faculty: Option<String>,
pub school: Option<String>,
Expand Down Expand Up @@ -49,15 +49,15 @@ pub struct Time {

#[derive(Debug)]
pub struct ClassScraper {
pub subject_area_course_code: String,
pub subject_area_course_name: String,
pub course_code: String,
pub course_name: String,
pub uoc: i32,
pub url: String,
}

impl ClassScraper {
pub async fn scrape(&mut self) -> Result<Course, Box<ScrapeError>> {
println!("Currently working on {:?}", self.subject_area_course_code);
println!("Currently working on {:?}", self.course_code);
let html = fetch_url(&self.url)
.await
.expect(&format!("Something was wrong with the URL: {}", self.url));
Expand All @@ -84,8 +84,8 @@ impl ClassScraper {
.map(|course_name_words| String::from(course_name_words))
.collect();
let mut course_info = Course {
subject_area_course_code: self.subject_area_course_code.clone(),
subject_area_course_name: course_name_code_info.join(" "),
course_code: self.course_code.clone(),
course_name: course_name_code_info.join(" "),
uoc: self.uoc,
faculty: None,
school: None,
Expand Down Expand Up @@ -151,7 +151,7 @@ impl ClassScraper {

course_info.classes = class_activity_information
.into_par_iter()
.map(|class_data| parse_class_info(class_data, self.subject_area_course_code.clone()))
.map(|class_data| parse_class_info(class_data, self.course_code.clone()))
.collect();
let _ = course_info
.classes
Expand Down
5 changes: 2 additions & 3 deletions src/hasuragres_b_insert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use serde_json::Value;
use std::env;
use std::error::Error;
use std::fs::File;
use std::future::IntoFuture;
use std::io::Read;
use std::vec;

Expand Down Expand Up @@ -87,8 +86,8 @@ pub async fn send_batch_data(hdata: &impl HasuragresData) -> Result<(), Box<dyn
metadata: Metadata {
table_name: "courses".to_string(),
columns: vec![
"subject_area_course_code".to_string(),
"subject_area_course_name".to_string(),
"course_code".to_string(),
"course_name".to_string(),
"uoc".to_string(),
"faculty".to_string(),
"school".to_string(),
Expand Down
7 changes: 3 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@ mod school_area_scraper;
mod subject_area_scraper;
mod text_manipulators;

pub use class_scraper::{Class, ClassScraper, Course, Time};
pub use hasuragres_b_insert::{send_batch_data, ReadFromFile, ReadFromMemory};
pub use school_area_scraper::SchoolAreaScraper;
pub use scraper::fetch_url;
pub use scraper::Scraper;
pub use subject_area_scraper::SubjectAreaScraper;
pub use text_manipulators::mutate_string_to_include_curr_year;
pub use url_invalid_error::UrlInvalidError;
// pub use subject_area_scraper::SubjectAreaScraper;
pub use class_scraper::{Class, ClassScraper, Course, Time};
pub use school_area_scraper::SchoolAreaScraper;
pub use subject_area_scraper::SubjectAreaScraper;
12 changes: 6 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ async fn run_school_courses_page_scraper_job(

// Iterate over the pages and create tasks for each scrape operation
for school_area_scrapers in &mut all_school_offered_courses_scraper.pages {
let subject_area_scraper = Arc::clone(&school_area_scrapers.subject_area_scraper);
let scraper = Arc::clone(&school_area_scrapers.subject_area_scraper);
let task = tokio::spawn(async move {
let mut scraper = subject_area_scraper.lock().await;
let mut scraper = scraper.lock().await;
let _ = scraper.scrape().await;
});
tasks.push(task);
Expand All @@ -65,11 +65,11 @@ async fn run_course_classes_page_scraper_job(
let rate_limit_delay = Duration::from_millis(1); // delay between tasks

for school_area_scrapers in &mut all_school_offered_courses_scraper.pages {
let subject_area_scraper = Arc::clone(&school_area_scrapers.subject_area_scraper);
let scraper = Arc::clone(&school_area_scrapers.subject_area_scraper);

// Lock the mutex to access the underlying data
let class_scrapers = {
let scraper = subject_area_scraper.lock().await;
let scraper = scraper.lock().await;
scraper.class_scrapers.clone()
};

Expand Down Expand Up @@ -113,8 +113,8 @@ fn convert_courses_to_json(course_vec: &mut Vec<Course>) -> Vec<serde_json::Valu
let mut json_courses = Vec::new();
for course in course_vec.iter() {
json_courses.push(json!({
"subject_area_course_code": course.subject_area_course_code,
"subject_area_course_name": course.subject_area_course_name,
"course_code": course.course_code,
"course_name": course.course_name,
"uoc": course.uoc,
"faculty": course.faculty,
"school": course.school,
Expand Down
14 changes: 6 additions & 8 deletions src/school_area_scraper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ use tokio::sync::Mutex;

#[derive(Debug)]
pub struct SchoolAreaPage {
pub subject_area_course_code: String,
pub subject_area_course_name: String,
pub course_code: String,
pub course_name: String,
pub school: String,
pub subject_area_scraper: Arc<Mutex<SubjectAreaScraper>>,
}
Expand Down Expand Up @@ -51,10 +51,8 @@ impl SchoolAreaScraper {
let document = scraper::Html::parse_document(&html);
for row_node in document.select(&row_selector) {
// Extract data from each row
let subject_area_course_code =
extract_text(row_node.select(&code_selector).next().unwrap());
let subject_area_course_name =
extract_text(row_node.select(&name_selector).next().unwrap());
let course_code = extract_text(row_node.select(&code_selector).next().unwrap());
let course_name = extract_text(row_node.select(&name_selector).next().unwrap());
let url = get_html_link_to_page(
row_node
.select(&link_selector)
Expand All @@ -63,8 +61,8 @@ impl SchoolAreaScraper {
);
let school = extract_text(row_node.select(&school_selector).next().unwrap());
let page = SchoolAreaPage {
subject_area_course_code,
subject_area_course_name,
course_code,
course_name,
school,
subject_area_scraper: Arc::new(Mutex::new(SubjectAreaScraper::new(url))),
};
Expand Down
10 changes: 4 additions & 6 deletions src/subject_area_scraper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,8 @@ impl SubjectAreaScraper {
let document = scraper::Html::parse_document(&html);
for row_node in document.select(&row_selector) {
// Extract data from each row
let subject_area_course_code =
extract_text(row_node.select(&code_selector).next().unwrap());
let subject_area_course_name =
extract_text(row_node.select(&name_selector).next().unwrap());
let course_code = extract_text(row_node.select(&code_selector).next().unwrap());
let course_name = extract_text(row_node.select(&name_selector).next().unwrap());
let url = get_html_link_to_page(
row_node
.select(&link_selector)
Expand All @@ -46,8 +44,8 @@ impl SubjectAreaScraper {
.parse()
.expect("Could not parse UOC!");
self.class_scrapers.push(Arc::new(Mutex::new(ClassScraper {
subject_area_course_code,
subject_area_course_name,
course_code,
course_name,
uoc,
url,
})));
Expand Down
Loading
Loading