From 8c6026d16e919384f14605ad03b75ba9c45f7d3e Mon Sep 17 00:00:00 2001 From: Nulo Date: Tue, 31 Dec 2024 00:21:25 -0300 Subject: [PATCH] scrapear airfleets --- misc/migration.sql | 45 +++----- package.json | 2 + pnpm-lock.yaml | 46 ++++++++ trigger/scrap-airfleets.ts | 211 +++++++++++++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 29 deletions(-) create mode 100644 trigger/scrap-airfleets.ts diff --git a/misc/migration.sql b/misc/migration.sql index 70cfb99..8a7f430 100644 --- a/misc/migration.sql +++ b/misc/migration.sql @@ -1,30 +1,17 @@ -CREATE TYPE flights_relative_to_airport AS ENUM ('arrivals', 'departures'); +create table aerolineas_latest_flight_status( + aerolineas_flight_id TEXT not null primary key, + last_updated TIMESTAMP not null, + json JSONB not null +) -create table flightstats_snapshots( - url TEXT not null, - fetched_at TIMESTAMP not null, - b2_raw_path TEXT not null, - - airport_iata TEXT not null, - flights_relative_to_airport flights_relative_to_airport not null, - last_updated_at TIMESTAMP, - date DATE not null, - - -- JSON - entries TEXT not null, -- FlightstatsSnapshotEntries - PRIMARY KEY (b2_raw_path) -); - -create table aerolineas_snapshots( - url TEXT not null, - fetched_at TIMESTAMP not null, - b2_raw_path TEXT not null, - - airport_iata TEXT not null, - flights_relative_to_airport flights_relative_to_airport not null, - date DATE not null, - - -- JSON - entries TEXT not null, -- array - PRIMARY KEY (b2_raw_path) -); \ No newline at end of file +create table airfleets_matriculas( + fetched_at timestamp not null, + matricula text not null, + aeronave text not null, + msn text not null, + compania_aerea text not null, + situacion text not null, + detail_url text not null, + edad_del_avion real not null, + config_de_asientos text not null +) \ No newline at end of file diff --git a/package.json b/package.json index 0af2fb4..caf1f73 100644 --- a/package.json +++ b/package.json @@ -9,9 +9,11 @@ "cheerio": "^1.0.0", 
"date-fns": "^4.1.0", "dayjs": "^1.11.13", + "fetch-cookie": "^3.1.0", "fetch-retry": "^6.0.0", "p-queue": "^8.0.1", "postgres": "^3.4.5", + "undici": "^7.2.0", "waddler": "^0.0.9", "zod": "^3.24.1" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a6b247e..e86c04e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -35,6 +35,9 @@ importers: dayjs: specifier: ^1.11.13 version: 1.11.13 + fetch-cookie: + specifier: ^3.1.0 + version: 3.1.0 fetch-retry: specifier: ^6.0.0 version: 6.0.0 @@ -44,6 +47,9 @@ importers: postgres: specifier: ^3.4.5 version: 3.4.5 + undici: + specifier: ^7.2.0 + version: 7.2.0 waddler: specifier: ^0.0.9 version: 0.0.9(@duckdb/node-api@1.1.3-alpha.7)(duckdb@1.1.3(encoding@0.1.13)) @@ -929,6 +935,9 @@ packages: picomatch: optional: true + fetch-cookie@3.1.0: + resolution: {integrity: sha512-s/XhhreJpqH0ftkGVcQt8JE9bqk+zRn4jF5mPJXWZeQMCI5odV9K+wEWYbnzFPHgQZlvPSMjS4n4yawWE8RINw==} + fetch-retry@6.0.0: resolution: {integrity: sha512-BUFj1aMubgib37I3v4q78fYo63Po7t4HUPTpQ6/QE6yK6cIQrP+W43FYToeTEyg5m2Y7eFUtijUuAv/PDlWuag==} @@ -1324,6 +1333,9 @@ packages: set-blocking@2.0.0: resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==} + set-cookie-parser@2.7.1: + resolution: {integrity: sha512-IOc8uWeOZgnb3ptbCURJWNjWUPcO3ZnTTdzsurqERrP6nPyv+paC55vJM0LpOlT2ne+Ix+9+CRG1MNLlyZ4GjQ==} + shebang-command@2.0.0: resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==} engines: {node: '>=8'} @@ -1418,6 +1430,17 @@ packages: resolution: {integrity: sha512-Zc+8eJlFMvgatPZTl6A9L/yht8QqdmUNtURHaKZLmKBE12hNPSrqNkUp2cs3M/UKmNVVAMFQYSjYIVHDjW5zew==} engines: {node: '>=12.0.0'} + tldts-core@6.1.70: + resolution: {integrity: sha512-RNnIXDB1FD4T9cpQRErEqw6ZpjLlGdMOitdV+0xtbsnwr4YFka1zpc7D4KD+aAn8oSG5JyFrdasZTE04qDE9Yg==} + + tldts@6.1.70: + resolution: {integrity: 
sha512-/W1YVgYVJd9ZDjey5NXadNh0mJXkiUMUue9Zebd0vpdo1sU+H4zFFTaJ1RKD4N6KFoHfcXy6l+Vu7bh+bdWCzA==} + hasBin: true + + tough-cookie@5.0.0: + resolution: {integrity: sha512-FRKsF7cz96xIIeMZ82ehjC3xW2E+O2+v11udrDYewUbszngYhsGa8z6YUMMzO9QJZzzyd0nGGXnML/TReX6W8Q==} + engines: {node: '>=16'} + tr46@0.0.3: resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} @@ -1455,6 +1478,10 @@ packages: resolution: {integrity: sha512-BUgJXc752Kou3oOIuU1i+yZZypyZRqNPW0vqoMPl8VaoalSfeR0D8/t4iAS3yirs79SSMTxTag+ZC86uswv+Cw==} engines: {node: '>=18.17'} + undici@7.2.0: + resolution: {integrity: sha512-klt+0S55GBViA9nsq48/NSCo4YX5mjydjypxD7UmHh/brMu8h/Mhd/F7qAeoH2NOO8SDTk6kjnTFc4WpzmfYpQ==} + engines: {node: '>=20.18.1'} + unique-filename@2.0.1: resolution: {integrity: sha512-ODWHtkkdx3IAR+veKxFV+VBkUMcN+FaqzUUd7IZzt+0zhDZFPFxhlqwPF3YQvMHx1TD0tdgYl+kuPnJ8E6ql7A==} engines: {node: ^12.13.0 || ^14.15.0 || >=16.0.0} @@ -3045,6 +3072,11 @@ snapshots: optionalDependencies: picomatch: 4.0.2 + fetch-cookie@3.1.0: + dependencies: + set-cookie-parser: 2.7.1 + tough-cookie: 5.0.0 + fetch-retry@6.0.0: {} fs-minipass@2.1.0: @@ -3515,6 +3547,8 @@ snapshots: set-blocking@2.0.0: optional: true + set-cookie-parser@2.7.1: {} + shebang-command@2.0.0: dependencies: shebang-regex: 3.0.0 @@ -3628,6 +3662,16 @@ snapshots: fdir: 6.4.2(picomatch@4.0.2) picomatch: 4.0.2 + tldts-core@6.1.70: {} + + tldts@6.1.70: + dependencies: + tldts-core: 6.1.70 + + tough-cookie@5.0.0: + dependencies: + tldts: 6.1.70 + tr46@0.0.3: optional: true @@ -3647,6 +3691,8 @@ snapshots: undici@6.21.0: {} + undici@7.2.0: {} + unique-filename@2.0.1: dependencies: unique-slug: 3.0.0 diff --git a/trigger/scrap-airfleets.ts b/trigger/scrap-airfleets.ts new file mode 100644 index 0000000..9338dec --- /dev/null +++ b/trigger/scrap-airfleets.ts @@ -0,0 +1,211 @@ +import { logger, schemaTask } from "@trigger.dev/sdk/v3"; +import { sqlBuilder } from "../consts"; +import fetchBuilder 
from "fetch-retry";
+import { saveRawIntoB2 } from "../trigger-utils";
+import * as cheerio from "cheerio";
+import { ProxyAgent, fetch as undiciFetch, FormData } from "undici";
+import fetchCookie from "fetch-cookie";
+
+// fetch-retry wrapped around a cookie-jar fetch on top of undici's fetch.
+const fetch = fetchBuilder(fetchCookie(undiciFetch));
+// Proxy agent used for airfleets requests; undefined when PROXY_URL is unset.
+let dispatcher = (process.env.PROXY_URL && genDispatcher()) || undefined;
+// Cap on proxy changes per URL so a persistently blocked URL cannot recurse forever.
+const MAX_PROXY_CHANGES = 5;
+// Polls of getTaskResult before giving up (5 s apart, so roughly 3 minutes).
+const MAX_CAPTCHA_POLLS = 36;
+
+/** Builds a new ProxyAgent pointing at PROXY_URL. */
+function genDispatcher() {
+  return new ProxyAgent({
+    uri: process.env.PROXY_URL!,
+    keepAliveTimeout: 180e3,
+    connectTimeout: 10e3,
+    bodyTimeout: 15e3,
+  });
+}
+
+/**
+ * Replaces the module-level dispatcher with a fresh proxy agent.
+ * NOTE(review): presumably PROXY_URL is a rotating proxy, so a new agent means a
+ * new exit IP — confirm.
+ *
+ * @param reason why the change is needed; used as the error message prefix
+ * @param proxyChanges changes already made for the URL being fetched
+ * @throws when PROXY_URL is unset or MAX_PROXY_CHANGES was reached
+ */
+function changeProxy(reason: string, proxyChanges: number) {
+  if (!process.env.PROXY_URL) {
+    throw new Error(`${reason}, and PROXY_URL is not set so there is no proxy to change`);
+  }
+  if (proxyChanges >= MAX_PROXY_CHANGES) {
+    throw new Error(`${reason}, giving up after ${proxyChanges} proxy changes`);
+  }
+  dispatcher = genDispatcher();
+}
+
+/**
+ * Looks up on airfleets.es every matricula present in aerolineas_latest_flight_status
+ * but missing from airfleets_matriculas, and inserts one row per aircraft found.
+ * The raw search and detail HTML is handed to saveRawIntoB2 before parsing.
+ */
+export const scrapMatriculasTask = schemaTask({
+  id: "scrap-matriculas",
+  maxDuration: 600,
+  run: async (payload, { ctx }) => {
+    const sql = sqlBuilder();
+    const matriculas = await sql<{ matricula: string }[]>`
+      select distinct json->>'matricula' as matricula
+      from aerolineas_latest_flight_status
+      where json->>'matricula' not in (select matricula from airfleets_matriculas);
+    `;
+    const fetched_at = new Date();
+
+    logger.info(`Trying to fetch ${matriculas.length} matriculas`);
+
+    for (const { matricula } of matriculas) {
+      const searchUrl =
+        "https://www.airfleets.es/recherche/?key=" + encodeURIComponent(matricula);
+      const searchHtml = await fetchAirfleets(searchUrl);
+      if (searchHtml === 404) {
+        logger.warn(`404 encontrado para ${matricula}, skipping`);
+        continue;
+      }
+      await saveRawIntoB2({ body: searchHtml, fetched_at, url: searchUrl });
+      const $ = cheerio.load(searchHtml);
+      const table = $("div.ten.columns.padgauche > table:nth-of-type(1)");
+      // Text of the n-th cell (1-based) of the result row(s) in the first table.
+      const cell = (n: number) =>
+        table.find(`.tabcontent > td:nth-of-type(${n})`).text();
+      const aeronave = cell(1);
+      const msn = cell(3);
+      const compania_aerea = cell(4);
+      const situacion = cell(5);
+
+      // Without a result link, new URL(undefined, searchUrl) would silently resolve
+      // to ".../recherche/undefined", so skip the matricula instead.
+      const href = $(".tabcontent > td:nth-of-type(1) .lien").attr("href");
+      if (!href) {
+        logger.warn(`Sin resultados para ${matricula}, skipping`);
+        continue;
+      }
+      const detail_url = new URL(href, searchUrl).toString();
+
+      const detailHtml = await fetchAirfleets(detail_url);
+      if (detailHtml === 404) {
+        logger.warn(`404 encontrado para ${matricula}, skipping`);
+        continue;
+      }
+      await saveRawIntoB2({ body: detailHtml, fetched_at, url: detail_url });
+      const detail$ = cheerio.load(detailHtml);
+      // Second ".texten" cell of the detail-page rows whose text contains `label`.
+      // Rows are wrapped with detail$ (their own document), not the search-page $.
+      const detailCell = (label: string) =>
+        detail$("tr")
+          .filter((_, row) => detail$(row).text().includes(label))
+          .find(".texten:nth-of-type(2)");
+
+      const edad_del_avion_str = detailCell("Edad del").text().trim();
+      // First run of digits/dots in the text; falls back to 0 when there is none.
+      const edad_del_avion = parseFloat(
+        edad_del_avion_str.match(/[\d.]+/)?.[0] ?? "0"
+      );
+
+      const config_de_asientosEl = detailCell("Config de asientos");
+      // Drop nested <span>s so only the cell's own text remains.
+      config_de_asientosEl.find("span").remove();
+      const config_de_asientos = config_de_asientosEl.text().trim();
+
+      await sql`
+        insert into airfleets_matriculas
+          (fetched_at, matricula, aeronave, msn, compania_aerea, situacion, detail_url, edad_del_avion, config_de_asientos)
+        values (${fetched_at}, ${matricula}, ${aeronave}, ${msn}, ${compania_aerea}, ${situacion}, ${detail_url}, ${edad_del_avion}, ${config_de_asientos})
+      `;
+      logger.info(`Inserted ${matricula}`, {
+        fetched_at,
+        matricula,
+        aeronave,
+        msn,
+        compania_aerea,
+        situacion,
+        detail_url,
+        edad_del_avion,
+        config_de_asientos,
+      });
+    }
+  },
+});
+
+const headers = {
+  "User-Agent":
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+};
+
+/**
+ * Fetches an airfleets.es page, solving captcha redirects and changing proxy when blocked.
+ *
+ * @param url page to fetch
+ * @param captchaAttempts captchas already submitted for this URL on the current proxy
+ * @param proxyChanges proxy changes already made for this URL
+ * @returns the HTML body, or the number 404 when the server answers 404
+ * @throws on any other non-200 status, when the captcha page has no sitekey, or
+ *   when a proxy change is required but PROXY_URL is unset or the cap was reached
+ */
+async function fetchAirfleets(
+  url: string | URL,
+  captchaAttempts = 0,
+  proxyChanges = 0
+): Promise<string | 404> {
+  // Redirects are handled manually so the captcha redirect can be detected.
+  const res = await fetch(url, { headers, redirect: "manual", dispatcher });
+  const location = res.headers.get("location");
+  if (res.status === 302 && location?.includes("captcha.php")) {
+    if (captchaAttempts > 3) {
+      logger.info("too many captchas, cambiando proxy...", { url });
+      changeProxy("too many captchas", proxyChanges);
+      return await fetchAirfleets(url, 0, proxyChanges + 1);
+    }
+    logger.info("Captcha detectado, resolviendo...", { url });
+    const captchaUrl = "https://www.airfleets.es/home/captcha.php";
+    const captchaRes = await fetch(captchaUrl, { headers, dispatcher });
+    const html = await captchaRes.text();
+    const websiteKey = cheerio.load(html)(".g-recaptcha").attr("data-sitekey");
+    if (!websiteKey) {
+      logger.debug("Debug info", {
+        html,
+        status: captchaRes.status,
+        headers: Array.from(captchaRes.headers.entries()),
+      });
+      throw new Error("No websiteKey found");
+    }
+    const code = await solveRecaptchaV2({ websiteKey, websiteURL: captchaUrl });
+    const form = new FormData();
+    form.append("g-recaptcha-response", code);
+    form.append("org", url.toString());
+    await fetch("https://www.airfleets.es/home/captcha2.php", {
+      method: "POST",
+      body: form,
+      headers,
+      dispatcher,
+    });
+    return await fetchAirfleets(url, captchaAttempts + 1, proxyChanges);
+  }
+  if (res.status === 429) {
+    logger.debug("Got ratelimited, changing proxy...");
+    changeProxy("rate limited (429)", proxyChanges);
+    return await fetchAirfleets(url, 0, proxyChanges + 1);
+  }
+  if (res.status === 404) return 404;
+  if (res.status !== 200) {
+    logger.error("Debug data", {
+      status: res.status,
+      // Headers is iterable, not a plain object: Object.entries() on its iterator is empty.
+      headers: Array.from(res.headers.entries()),
+    });
+    throw new Error(`got status ${res.status}`);
+  }
+  return await res.text();
+}
+
+/** Fields of the 2captcha createTask / getTaskResult responses that this file reads. */
+type TwoCaptchaResponse = {
+  errorId?: number;
+  errorCode?: string;
+  errorDescription?: string;
+  taskId?: number;
+  status?: string;
+  solution?: { gRecaptchaResponse: string };
+};
+
+/**
+ * Solves a reCAPTCHA v2 through the 2captcha API: creates a task, then polls its result.
+ *
+ * @param websiteURL URL of the page that shows the captcha
+ * @param websiteKey value of the page's data-sitekey attribute
+ * @returns the gRecaptchaResponse token of the solved task
+ * @throws when 2captcha reports an error or the task is not ready after MAX_CAPTCHA_POLLS polls
+ */
+async function solveRecaptchaV2({
+  websiteURL,
+  websiteKey,
+}: {
+  websiteURL: string;
+  websiteKey: string;
+}): Promise<string> {
+  // POSTs `body` plus the client key to a 2captcha endpoint; a non-zero errorId is an API error.
+  const call = async (endpoint: string, body: object) => {
+    const res = await fetch(`https://api.2captcha.com/${endpoint}`, {
+      method: "POST",
+      body: JSON.stringify({
+        clientKey: process.env.TWOCAPTCHA_API_KEY,
+        ...body,
+      }),
+    });
+    const json = (await res.json()) as TwoCaptchaResponse;
+    if (json.errorId) {
+      throw new Error(
+        `2captcha ${endpoint} failed: ${json.errorCode ?? json.errorId} ${json.errorDescription ?? ""}`
+      );
+    }
+    return json;
+  };
+
+  const { taskId } = await call("createTask", {
+    task: {
+      type: "RecaptchaV2TaskProxyless",
+      websiteURL,
+      websiteKey,
+      isInvisible: false,
+    },
+  });
+
+  for (let poll = 0; poll < MAX_CAPTCHA_POLLS; poll++) {
+    const taskResult = await call("getTaskResult", { taskId });
+    if (taskResult.status === "ready" && taskResult.solution) {
+      logger.debug("Got solved recaptcha", { taskResult });
+      return taskResult.solution.gRecaptchaResponse;
+    }
+    await new Promise((r) => setTimeout(r, 5000));
+  }
+  throw new Error("2captcha did not solve the captcha in time");
+}