Skip to content

Commit

Permalink
scrapear airfleets
Browse files Browse the repository at this point in the history
  • Loading branch information
catdevnull committed Dec 31, 2024
1 parent 3046365 commit 8c6026d
Show file tree
Hide file tree
Showing 4 changed files with 275 additions and 29 deletions.
45 changes: 16 additions & 29 deletions misc/migration.sql
Original file line number Diff line number Diff line change
@@ -1,30 +1,17 @@
-- Enum with two values, 'arrivals' and 'departures'; used as a column type by the snapshot tables.
CREATE TYPE flights_relative_to_airport AS ENUM ('arrivals', 'departures');
-- One row per aerolineas_flight_id (primary key) holding a JSONB payload and
-- the time it was last updated.
-- NOTE(review): presumably `json` is the latest flight-status response for
-- that flight; the scraper reads json->>'matricula' from it. Confirm the rest
-- of the schema against the writer.
create table aerolineas_latest_flight_status(
  aerolineas_flight_id TEXT not null primary key,
  last_updated TIMESTAMP not null,
  json JSONB not null
);

-- One row per snapshot, uniquely identified by b2_raw_path (primary key).
-- NOTE(review): b2_raw_path presumably points at the raw response stored in B2 — confirm.
create table flightstats_snapshots(
url TEXT not null,
fetched_at TIMESTAMP not null,
b2_raw_path TEXT not null,

airport_iata TEXT not null,
flights_relative_to_airport flights_relative_to_airport not null,
last_updated_at TIMESTAMP,
date DATE not null,

-- JSON
entries TEXT not null, -- FlightstatsSnapshotEntries
PRIMARY KEY (b2_raw_path)
);

-- One row per snapshot, uniquely identified by b2_raw_path (primary key).
-- Same columns as flightstats_snapshots except that there is no last_updated_at.
create table aerolineas_snapshots(
url TEXT not null,
fetched_at TIMESTAMP not null,
b2_raw_path TEXT not null,

airport_iata TEXT not null,
flights_relative_to_airport flights_relative_to_airport not null,
date DATE not null,

-- JSON
entries TEXT not null, -- array<AerolineasFlightData>
PRIMARY KEY (b2_raw_path)
);
-- Aircraft data scraped from airfleets.es, one row per inserted registration
-- ("matricula"). Filled by the scrap-matriculas task, which skips any
-- registration that already has a row here.
-- NOTE(review): there is no primary key or unique constraint on matricula;
-- uniqueness currently relies on the task's "not in" filter — confirm whether
-- a constraint is wanted.
create table airfleets_matriculas(
  fetched_at timestamp not null,
  matricula text not null,
  aeronave text not null,
  msn text not null,
  compania_aerea text not null,
  situacion text not null,
  detail_url text not null,
  edad_del_avion real not null,
  config_de_asientos text not null
);
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
"cheerio": "^1.0.0",
"date-fns": "^4.1.0",
"dayjs": "^1.11.13",
"fetch-cookie": "^3.1.0",
"fetch-retry": "^6.0.0",
"p-queue": "^8.0.1",
"postgres": "^3.4.5",
"undici": "^7.2.0",
"waddler": "^0.0.9",
"zod": "^3.24.1"
},
Expand Down
46 changes: 46 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

211 changes: 211 additions & 0 deletions trigger/scrap-airfleets.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
import { logger, schemaTask } from "@trigger.dev/sdk/v3";
import { sqlBuilder } from "../consts";
import fetchBuilder from "fetch-retry";
import { saveRawIntoB2 } from "../trigger-utils";
import * as cheerio from "cheerio";
import { ProxyAgent, fetch as undiciFetch, FormData } from "undici";
import fetchCookie from "fetch-cookie";

// HTTP client used by this module: undici's fetch wrapped by fetch-cookie and then by fetch-retry.
const fetch = fetchBuilder(fetchCookie(undiciFetch));
// Proxy agent passed to airfleets requests; stays undefined when PROXY_URL is unset or empty.
let dispatcher = process.env.PROXY_URL ? genDispatcher() : undefined;

/**
 * Builds a fresh undici `ProxyAgent` pointed at `process.env.PROXY_URL`.
 *
 * The caller is responsible for making sure `PROXY_URL` is set; the value is
 * passed through with a non-null assertion.
 */
function genDispatcher() {
  const proxyUri = process.env.PROXY_URL!;
  const agent = new ProxyAgent({
    uri: proxyUri,
    keepAliveTimeout: 180e3,
    connectTimeout: 10e3,
    bodyTimeout: 15e3,
  });
  return agent;
}

/**
 * Trigger.dev task that looks up aircraft registrations ("matriculas") on
 * airfleets.es and stores what it scrapes in `airfleets_matriculas`.
 *
 * For every distinct `json->>'matricula'` in `aerolineas_latest_flight_status`
 * that has no row in `airfleets_matriculas` yet, it:
 *  1. fetches the airfleets search page for that registration,
 *  2. follows the first result's link to the aircraft detail page,
 *  3. saves both raw HTML bodies through `saveRawIntoB2`,
 *  4. inserts one row with the scraped fields.
 *
 * Registrations are processed one at a time. A registration is skipped (with
 * a warning) when either page returns 404 or when the search page contains no
 * detail link. Any other fetch failure propagates and aborts the run; rows
 * inserted before the failure stay in the table.
 */
export const scrapMatriculasTask = schemaTask({
  id: "scrap-matriculas",
  maxDuration: 600,
  run: async (payload, { ctx }) => {
    const sql = sqlBuilder();
    const matriculas = await sql<{ matricula: string }[]>`
      select distinct json->>'matricula' as matricula
      from aerolineas_latest_flight_status
      where json->>'matricula' not in (select matricula from airfleets_matriculas);
    `;

    // A single timestamp is shared by every page and row produced in this run.
    const fetched_at = new Date();

    logger.info(`Trying to fetch ${matriculas.length} matriculas`);

    for (const { matricula } of matriculas) {
      // Encode the registration so characters such as spaces, '&' or '#'
      // cannot break or alter the query string.
      const searchUrl =
        "https://www.airfleets.es/recherche/?key=" +
        encodeURIComponent(matricula);
      const searchHtml = await fetchAirfleets(searchUrl);
      if (searchHtml === 404) {
        logger.warn(`404 encontrado para ${matricula}, skipping`);
        continue;
      }
      await saveRawIntoB2({ body: searchHtml, fetched_at, url: searchUrl });

      const $ = cheerio.load(searchHtml);
      const table = $("div.ten.columns.padgauche > table:nth-of-type(1)");
      // `.first()` keeps only the first result row's cell. Without it, a
      // search with several result rows would concatenate the text of every
      // row, while the detail link below already uses the first match only.
      const resultCell = (column: number): string =>
        table.find(`.tabcontent > td:nth-of-type(${column})`).first().text();
      const aeronave = resultCell(1);
      const msn = resultCell(3);
      const compania_aerea = resultCell(4);
      const situacion = resultCell(5);

      const detailHref = $(".tabcontent > td:nth-of-type(1) .lien").attr(
        "href"
      );
      if (!detailHref) {
        // Without this guard `new URL(undefined, searchUrl)` would silently
        // resolve to ".../recherche/undefined" and that page would be scraped.
        logger.warn(`No detail link found for ${matricula}, skipping`);
        continue;
      }
      const detail_url = new URL(detailHref, searchUrl).toString();

      const detailHtml = await fetchAirfleets(detail_url);
      if (detailHtml === 404) {
        logger.warn(`404 encontrado para ${matricula}, skipping`);
        continue;
      }
      await saveRawIntoB2({ body: detailHtml, fetched_at, url: detail_url });
      const detail$ = cheerio.load(detailHtml);

      // Second `.texten` cell of every detail-page row whose text contains `label`.
      const detailValueCell = (label: string) =>
        detail$("tr")
          .filter((_index, row) => detail$(row).text().includes(label))
          .find(".texten:nth-of-type(2)");

      const edad_del_avion_str = detailValueCell("Edad del").text().trim();
      // First run of digits/dots in the cell; falls back to 0 when none is found.
      const edad_del_avion = parseFloat(
        edad_del_avion_str.match(/[\d.]+/)?.[0] ?? "0"
      );

      const config_de_asientosEl = detailValueCell("Config de asientos");
      // Drop nested <span> elements so only the cell's own text remains.
      config_de_asientosEl.find("span").remove();
      const config_de_asientos = config_de_asientosEl.text().trim();

      await sql`
        insert into airfleets_matriculas
        (fetched_at, matricula, aeronave, msn, compania_aerea, situacion, detail_url, edad_del_avion, config_de_asientos)
        values (${fetched_at}, ${matricula}, ${aeronave}, ${msn}, ${compania_aerea}, ${situacion}, ${detail_url}, ${edad_del_avion}, ${config_de_asientos})
      `;
      logger.info(`Inserted ${matricula}`, {
        fetched_at,
        matricula,
        aeronave,
        msn,
        compania_aerea,
        situacion,
        detail_url,
        edad_del_avion,
        config_de_asientos,
      });
    }
  },
});

// Request headers sent with every airfleets.es request made by fetchAirfleets:
// a desktop Chrome-on-macOS User-Agent string.
const headers = {
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
};

/**
 * Fetches an airfleets.es page and returns its HTML.
 *
 * Redirects are not followed automatically so that the site's captcha
 * redirect (a 302 whose `location` contains `captcha.php`) can be detected.
 * When that happens the captcha page is loaded, its reCAPTCHA is solved via
 * `solveRecaptchaV2`, the answer is posted to `captcha2.php`, and the original
 * URL is requested again. After more than 3 captcha rounds, or on a 429
 * response, the module-level `dispatcher` is replaced with a new proxy agent
 * and the request starts over.
 *
 * @param url Page to fetch.
 * @param captchaAttempts Captcha rounds already spent on this URL with the
 *   current dispatcher (used by the recursive retries).
 * @param proxyRotations Proxy changes already made for this URL (used by the
 *   recursive retries to bound the total number of attempts).
 * @returns The response body as text, or the literal `404` when the page
 *   does not exist.
 * @throws Error on any other non-200 status, when the captcha page carries no
 *   site key, when a proxy change is needed but `PROXY_URL` is not set, or
 *   when the proxy has already been changed too many times.
 */
async function fetchAirfleets(
  url: string | URL,
  captchaAttempts = 0,
  proxyRotations = 0
): Promise<string | 404> {
  // Upper bound on proxy changes per URL so a persistent 429/captcha cannot
  // recurse forever.
  const maxProxyRotations = 10;

  // Replaces the shared dispatcher and restarts the request from scratch.
  // NOTE(review): assumes PROXY_URL is a rotating proxy, so a new agent gets a
  // different exit IP — confirm.
  const retryWithNewProxy = async (): Promise<string | 404> => {
    if (!process.env.PROXY_URL) {
      throw new Error(
        `cannot change proxy for ${url.toString()}: PROXY_URL is not set`
      );
    }
    if (proxyRotations >= maxProxyRotations) {
      throw new Error(
        `giving up on ${url.toString()} after ${proxyRotations} proxy changes`
      );
    }
    dispatcher = genDispatcher();
    return fetchAirfleets(url, 0, proxyRotations + 1);
  };

  const res = await fetch(url, {
    headers,
    redirect: "manual",
    dispatcher,
  });

  const redirectedToCaptcha =
    res.status === 302 &&
    Boolean(res.headers.get("location")?.includes("captcha.php"));

  if (redirectedToCaptcha) {
    if (captchaAttempts > 3) {
      logger.info("too many captchas, cambiando proxy...", { url });
      return await retryWithNewProxy();
    }
    logger.info("Captcha detectado, resolviendo...", { url });
    const captchaUrl = "https://www.airfleets.es/home/captcha.php";
    const captchaRes = await fetch(captchaUrl, {
      headers,
      dispatcher,
    });
    const captchaHtml = await captchaRes.text();
    const $ = cheerio.load(captchaHtml);
    const websiteKey = $(".g-recaptcha").attr("data-sitekey");
    if (!websiteKey) {
      logger.debug("Debug info", {
        html: captchaHtml,
        status: captchaRes.status,
        headers: Array.from(captchaRes.headers.entries()),
      });
      throw new Error("No websiteKey found");
    }
    const code = await solveRecaptchaV2({
      websiteKey,
      websiteURL: captchaUrl,
    });
    const form = new FormData();
    form.append("g-recaptcha-response", code);
    // `org` carries the page that was originally requested.
    form.append("org", url.toString());
    await fetch("https://www.airfleets.es/home/captcha2.php", {
      method: "POST",
      body: form,
      headers,
      dispatcher,
    });
    return await fetchAirfleets(url, captchaAttempts + 1, proxyRotations);
  }

  if (res.status !== 200) {
    if (res.status === 429) {
      logger.debug("Got ratelimited, changing proxy...");
      return await retryWithNewProxy();
    }
    if (res.status === 404) return 404;
    logger.error("Debug data", {
      status: res.status,
      // `Headers.entries()` returns an iterator; Array.from materialises the
      // pairs (Object.entries on the iterator would always yield []).
      headers: Array.from(res.headers.entries()),
    });
    throw new Error(`got status ${res.status}`);
  }

  return await res.text();
}

/**
 * Solves a reCAPTCHA v2 challenge through the 2captcha API.
 *
 * Creates a `RecaptchaV2TaskProxyless` task and then polls `getTaskResult`
 * every 5 seconds until the task is reported as `ready`.
 *
 * @param websiteURL URL of the page that shows the captcha.
 * @param websiteKey The reCAPTCHA site key found on that page.
 * @returns The `gRecaptchaResponse` token to submit with the form.
 * @throws Error when `TWOCAPTCHA_API_KEY` is not set, when 2captcha answers
 *   with a non-zero `errorId`, when the responses lack the expected fields, or
 *   when the task is still not ready after the polling limit.
 */
async function solveRecaptchaV2({
  websiteURL,
  websiteKey,
}: {
  websiteURL: string;
  websiteKey: string;
}): Promise<string> {
  // Only the fields of the 2captcha responses that this function reads.
  type TwoCaptchaResponse = {
    errorId?: number;
    errorCode?: string;
    errorDescription?: string;
    taskId?: number | string;
    status?: string;
    solution?: { gRecaptchaResponse?: string };
  };

  const pollIntervalMs = 5000;
  // 36 polls * 5 s = 3 minutes before giving up instead of looping forever.
  const maxPolls = 36;

  const clientKey = process.env.TWOCAPTCHA_API_KEY;
  if (!clientKey) {
    throw new Error("TWOCAPTCHA_API_KEY is not set");
  }

  // POSTs a JSON body to a 2captcha endpoint and rejects on an API-level error.
  const call2Captcha = async (
    endpoint: "createTask" | "getTaskResult",
    payload: Record<string, unknown>
  ): Promise<TwoCaptchaResponse> => {
    const res = await fetch(`https://api.2captcha.com/${endpoint}`, {
      method: "POST",
      body: JSON.stringify({ clientKey, ...payload }),
    });
    const json = (await res.json()) as TwoCaptchaResponse;
    if (json.errorId) {
      // Without this check a failed request would never reach "ready" and the
      // polling loop would spin indefinitely.
      throw new Error(
        `2captcha ${endpoint} failed: ${json.errorCode ?? json.errorId} ${
          json.errorDescription ?? ""
        }`.trim()
      );
    }
    return json;
  };

  const task = await call2Captcha("createTask", {
    task: {
      type: "RecaptchaV2TaskProxyless",
      websiteURL,
      websiteKey,
      isInvisible: false,
    },
  });
  if (task.taskId == null) {
    throw new Error("2captcha createTask returned no taskId");
  }

  for (let poll = 0; poll < maxPolls; poll++) {
    const taskResult = await call2Captcha("getTaskResult", {
      taskId: task.taskId,
    });
    if (taskResult.status === "ready") {
      logger.debug("Got solved recaptcha", { taskResult });
      const token = taskResult.solution?.gRecaptchaResponse;
      if (typeof token !== "string") {
        throw new Error("2captcha result is ready but has no gRecaptchaResponse");
      }
      return token;
    }
    await new Promise((r) => setTimeout(r, pollIntervalMs));
  }

  throw new Error(
    `2captcha task ${task.taskId} not ready after ${maxPolls} polls`
  );
}

0 comments on commit 8c6026d

Please sign in to comment.