-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3046365
commit 8c6026d
Showing
4 changed files
with
275 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,17 @@ | ||
-- Enum with the two values 'arrivals' and 'departures'. It is the type of the
-- flights_relative_to_airport column in the snapshot tables.
CREATE TYPE flights_relative_to_airport AS ENUM (
  'arrivals',
  'departures'
);
-- One row per Aerolineas flight id: a JSONB document plus the time it was
-- last updated. The "scrap-matriculas" task reads json->>'matricula' from
-- this table to find registrations that still need to be looked up.
create table aerolineas_latest_flight_status(
  aerolineas_flight_id TEXT not null primary key,
  last_updated TIMESTAMP not null,
  json JSONB not null
);
|
||
-- Snapshots of fetched FlightStats pages, keyed by b2_raw_path.
CREATE TABLE flightstats_snapshots (
  url          TEXT      NOT NULL,
  fetched_at   TIMESTAMP NOT NULL,
  b2_raw_path  TEXT      NOT NULL,

  airport_iata                 TEXT                        NOT NULL,
  flights_relative_to_airport  flights_relative_to_airport NOT NULL,
  last_updated_at              TIMESTAMP,
  date                         DATE                        NOT NULL,

  -- JSON stored as text: FlightstatsSnapshotEntries
  entries TEXT NOT NULL,
  PRIMARY KEY (b2_raw_path)
);
|
||
-- Snapshots of fetched Aerolineas pages, keyed by b2_raw_path.
CREATE TABLE aerolineas_snapshots (
  url          TEXT      NOT NULL,
  fetched_at   TIMESTAMP NOT NULL,
  b2_raw_path  TEXT      NOT NULL,

  airport_iata                 TEXT                        NOT NULL,
  flights_relative_to_airport  flights_relative_to_airport NOT NULL,
  date                         DATE                        NOT NULL,

  -- JSON stored as text: array<AerolineasFlightData>
  entries TEXT NOT NULL,
  PRIMARY KEY (b2_raw_path)
);
-- Aircraft data scraped from airfleets.es, one row inserted per registration
-- ("matricula") by the "scrap-matriculas" task.
-- NOTE(review): no primary key or unique constraint is declared on matricula,
-- although the task treats it as the lookup key; confirm whether that is
-- intended.
create table airfleets_matriculas(
  fetched_at timestamp not null,        -- start time of the scraping run
  matricula text not null,              -- registration that was searched for
  aeronave text not null,               -- search result table, column 1
  msn text not null,                    -- search result table, column 3
  compania_aerea text not null,         -- search result table, column 4
  situacion text not null,              -- search result table, column 5
  detail_url text not null,             -- absolute URL of the detail page
  edad_del_avion real not null,         -- number parsed from the "Edad del" row; 0 when none is found
  config_de_asientos text not null      -- text of the "Config de asientos" row
);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
import { logger, schemaTask } from "@trigger.dev/sdk/v3"; | ||
import { sqlBuilder } from "../consts"; | ||
import fetchBuilder from "fetch-retry"; | ||
import { saveRawIntoB2 } from "../trigger-utils"; | ||
import * as cheerio from "cheerio"; | ||
import { ProxyAgent, fetch as undiciFetch, FormData } from "undici"; | ||
import fetchCookie from "fetch-cookie"; | ||
|
||
// Module-wide HTTP client: undici's fetch wrapped by fetch-cookie and then by
// fetch-retry.
const fetch = fetchBuilder(fetchCookie(undiciFetch));

// Dispatcher passed to every airfleets request. It is only created when
// PROXY_URL is set, and fetchAirfleets replaces it when it switches proxies.
let dispatcher = process.env.PROXY_URL ? genDispatcher() : undefined;
|
||
/**
 * Creates a fresh undici `ProxyAgent` for the proxy configured in the
 * `PROXY_URL` environment variable.
 *
 * Kept as a hoisted function declaration because the module-level
 * `dispatcher` initialiser calls it before this point in the file.
 *
 * @returns A new `ProxyAgent` using the timeouts defined below.
 */
function genDispatcher() {
  const proxyOptions = {
    uri: process.env.PROXY_URL!,
    keepAliveTimeout: 180 * 1000,
    connectTimeout: 10 * 1000,
    bodyTimeout: 15 * 1000,
  };
  return new ProxyAgent(proxyOptions);
}
|
||
/**
 * Trigger.dev task "scrap-matriculas".
 *
 * For every registration ("matricula") that appears in
 * `aerolineas_latest_flight_status` but not yet in `airfleets_matriculas`, it
 * fetches the airfleets.es search page and the detail page linked from it,
 * stores both raw HTML bodies through `saveRawIntoB2`, and inserts one row
 * with the parsed fields.
 *
 * Registrations are processed one at a time. A registration is skipped with a
 * warning when it is empty, when either page answers 404, or when the search
 * page contains no link to a detail page. Any other failure thrown by
 * `fetchAirfleets`, `saveRawIntoB2` or the insert propagates and ends the run.
 */
export const scrapMatriculasTask = schemaTask({
  id: "scrap-matriculas",
  maxDuration: 600,
  run: async (payload, { ctx }) => {
    const sql = sqlBuilder();
    const matriculas = await sql<{ matricula: string }[]>`
      select distinct json->>'matricula' as matricula
      from aerolineas_latest_flight_status
      where json->>'matricula' not in (select matricula from airfleets_matriculas);
    `;

    // A single timestamp is shared by every raw upload and row of this run.
    const fetched_at = new Date();

    logger.info(`Trying to fetch ${matriculas.length} matriculas`);

    for (const { matricula } of matriculas) {
      // An empty registration would turn into a search for an empty key.
      if (!matricula) {
        logger.warn("Empty matricula, skipping");
        continue;
      }

      // Encode the key so unusual characters cannot alter the query string.
      const searchUrl =
        "https://www.airfleets.es/recherche/?key=" +
        encodeURIComponent(matricula);
      const searchHtml = await fetchAirfleets(searchUrl);
      if (searchHtml === 404) {
        logger.warn(`404 encontrado para ${matricula}, skipping`);
        continue;
      }
      await saveRawIntoB2({ body: searchHtml, fetched_at, url: searchUrl });

      const $ = cheerio.load(searchHtml);
      const table = $("div.ten.columns.padgauche > table:nth-of-type(1)");
      const aeronave = table.find(".tabcontent > td:nth-of-type(1)").text();
      const msn = table.find(".tabcontent > td:nth-of-type(3)").text();
      const compania_aerea = table
        .find(".tabcontent > td:nth-of-type(4)")
        .text();
      const situacion = table.find(".tabcontent > td:nth-of-type(5)").text();

      // Without this guard a missing link becomes the literal path segment
      // "undefined" once resolved against searchUrl, and that bogus page
      // would be fetched and inserted.
      const detailHref = $(".tabcontent > td:nth-of-type(1) .lien").attr(
        "href"
      );
      if (!detailHref) {
        logger.warn(`No detail link found for ${matricula}, skipping`);
        continue;
      }
      const detail_url = new URL(detailHref, searchUrl).toString();

      const detailHtml = await fetchAirfleets(detail_url);
      if (detailHtml === 404) {
        logger.warn(`404 encontrado para ${matricula}, skipping`);
        continue;
      }
      await saveRawIntoB2({ body: detailHtml, fetched_at, url: detail_url });
      const detail$ = cheerio.load(detailHtml);

      // Second ".texten" cell of the detail-page row(s) whose text contains
      // `label`. Uses detail$ throughout so the lookup stays on the detail
      // document.
      const detailCell = (label: string) =>
        detail$("tr")
          .filter(function () {
            return detail$(this).text().includes(label);
          })
          .find(".texten:nth-of-type(2)");

      // First run of digits/dots in the "Edad del" cell; falls back to 0 when
      // the cell has no number because the column is NOT NULL.
      const edad_del_avion_str = detailCell("Edad del").text().trim();
      const edad_del_avion = parseFloat(
        edad_del_avion_str.match(/[\d.]+/)?.[0] ?? "0"
      );

      // Drop nested <span> elements before reading the seat configuration.
      const config_de_asientosEl = detailCell("Config de asientos");
      config_de_asientosEl.find("span").remove();
      const config_de_asientos = config_de_asientosEl.text().trim();

      await sql`
        insert into airfleets_matriculas
          (fetched_at, matricula, aeronave, msn, compania_aerea, situacion, detail_url, edad_del_avion, config_de_asientos)
        values (${fetched_at}, ${matricula}, ${aeronave}, ${msn}, ${compania_aerea}, ${situacion}, ${detail_url}, ${edad_del_avion}, ${config_de_asientos})
      `;
      logger.info(`Inserted ${matricula}`, {
        fetched_at,
        matricula,
        aeronave,
        msn,
        compania_aerea,
        situacion,
        detail_url,
        edad_del_avion,
        config_de_asientos,
      });
    }
  },
});
|
||
// User-Agent string sent with every airfleets.es request.
const BROWSER_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

/** Request headers shared by all airfleets.es fetches in this file. */
const headers = {
  "User-Agent": BROWSER_USER_AGENT,
};
|
||
/** Maximum number of proxy switches attempted for one URL before giving up. */
const MAX_PROXY_ROTATIONS = 5;

/**
 * Fetches an airfleets.es page and returns its HTML.
 *
 * Redirects are not followed automatically. A 302 whose `location` contains
 * "captcha.php" triggers the captcha flow: the site key is read from the
 * captcha page, solved through `solveRecaptchaV2`, posted to `captcha2.php`,
 * and the original URL is requested again. After more than 3 captcha rounds,
 * or on a 429 response, the module-level `dispatcher` is replaced with a new
 * one from `genDispatcher()` and the request starts over.
 *
 * @param url Page to fetch.
 * @param captchaAttempts Captcha rounds already spent on this URL with the
 *   current dispatcher.
 * @param proxyRotations Proxy switches already made for this URL; bounds the
 *   recursion so a persistently blocked URL fails instead of looping forever.
 * @returns The response body on 200, or the number `404` on a 404 response.
 * @throws Error when the captcha page has no site key, when the proxy has
 *   been switched `MAX_PROXY_ROTATIONS` times, or on any other status.
 */
async function fetchAirfleets(
  url: string | URL,
  captchaAttempts = 0,
  proxyRotations = 0
): Promise<string | 404> {
  // Replaces the shared dispatcher and restarts the request with a fresh
  // captcha budget.
  const retryWithNewProxy = async (): Promise<string | 404> => {
    if (proxyRotations >= MAX_PROXY_ROTATIONS) {
      throw new Error(
        `giving up on ${url.toString()} after ${proxyRotations} proxy changes`
      );
    }
    dispatcher = genDispatcher();
    return await fetchAirfleets(url, 0, proxyRotations + 1);
  };

  const res = await fetch(url, {
    headers,
    redirect: "manual",
    dispatcher,
  });

  if (
    res.status === 302 &&
    res.headers.get("location")?.includes("captcha.php")
  ) {
    if (captchaAttempts > 3) {
      logger.info("too many captchas, cambiando proxy...", { url });
      return await retryWithNewProxy();
    }
    logger.info("Captcha detectado, resolviendo...", { url });

    const captchaUrl = "https://www.airfleets.es/home/captcha.php";
    const captchaRes = await fetch(captchaUrl, {
      headers,
      dispatcher,
    });
    const captchaHtml = await captchaRes.text();
    const websiteKey = cheerio
      .load(captchaHtml)(".g-recaptcha")
      .attr("data-sitekey");
    if (!websiteKey) {
      logger.debug("Debug info", {
        html: captchaHtml,
        status: captchaRes.status,
        headers: Array.from(captchaRes.headers.entries()),
      });
      throw new Error("No websiteKey found");
    }

    const code = await solveRecaptchaV2({
      websiteKey,
      websiteURL: captchaUrl,
    });
    const form = new FormData();
    form.append("g-recaptcha-response", code);
    form.append("org", url.toString());
    await fetch("https://www.airfleets.es/home/captcha2.php", {
      method: "POST",
      body: form,
      headers,
      dispatcher,
    });
    return await fetchAirfleets(url, captchaAttempts + 1, proxyRotations);
  }

  if (res.status === 200) {
    return await res.text();
  }
  if (res.status === 429) {
    logger.debug("Got ratelimited, changing proxy...");
    return await retryWithNewProxy();
  }
  if (res.status === 404) return 404;

  logger.error("Debug data", {
    status: res.status,
    // entries() is an iterator: Array.from materialises the [name, value]
    // pairs, whereas Object.entries() on it would always log [].
    headers: Array.from(res.headers.entries()),
  });
  throw new Error(`got status ${res.status}`);
}
|
||
/** Delay between two `getTaskResult` polls. */
const TWOCAPTCHA_POLL_INTERVAL_MS = 5000;
/** Total time to wait for a solution before giving up. */
const TWOCAPTCHA_MAX_WAIT_MS = 180 * 1000;

/** The 2captcha response fields this file reads. */
type TwoCaptchaResponse = {
  errorId?: number;
  errorCode?: string;
  errorDescription?: string;
  taskId?: number | string;
  status?: string;
  solution?: { gRecaptchaResponse?: string };
};

/**
 * POSTs a JSON payload, with the API key added as `clientKey`, to a 2captcha
 * API method and returns the parsed response.
 *
 * @throws Error when the response carries a non-zero `errorId`.
 */
async function callTwoCaptcha(
  method: "createTask" | "getTaskResult",
  payload: Record<string, unknown>
): Promise<TwoCaptchaResponse> {
  const res = await fetch(`https://api.2captcha.com/${method}`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      clientKey: process.env.TWOCAPTCHA_API_KEY,
      ...payload,
    }),
  });
  const data = (await res.json()) as TwoCaptchaResponse;
  if (data.errorId) {
    throw new Error(
      `2captcha ${method} failed: ${data.errorCode ?? data.errorId} ${
        data.errorDescription ?? ""
      }`.trim()
    );
  }
  return data;
}

/**
 * Solves a reCAPTCHA v2 challenge through the 2captcha API.
 *
 * Creates a `RecaptchaV2TaskProxyless` task, then polls `getTaskResult` every
 * `TWOCAPTCHA_POLL_INTERVAL_MS` until the task status is "ready".
 *
 * @param websiteURL URL of the page showing the captcha.
 * @param websiteKey The page's reCAPTCHA site key.
 * @returns The `gRecaptchaResponse` token from the solution.
 * @throws Error when `TWOCAPTCHA_API_KEY` is unset, when the API reports an
 *   error, when no task id or token is returned, or when no solution arrives
 *   within `TWOCAPTCHA_MAX_WAIT_MS`. Without these checks a failed task would
 *   be polled forever.
 */
async function solveRecaptchaV2({
  websiteURL,
  websiteKey,
}: {
  websiteURL: string;
  websiteKey: string;
}): Promise<string> {
  if (!process.env.TWOCAPTCHA_API_KEY) {
    throw new Error("TWOCAPTCHA_API_KEY is not set");
  }

  const task = await callTwoCaptcha("createTask", {
    task: {
      type: "RecaptchaV2TaskProxyless",
      websiteURL,
      websiteKey,
      isInvisible: false,
    },
  });
  if (task.taskId == null) {
    throw new Error("2captcha createTask returned no taskId");
  }

  const deadline = Date.now() + TWOCAPTCHA_MAX_WAIT_MS;
  while (Date.now() < deadline) {
    const taskResult = await callTwoCaptcha("getTaskResult", {
      taskId: task.taskId,
    });
    if (taskResult.status === "ready") {
      const token = taskResult.solution?.gRecaptchaResponse;
      if (!token) {
        throw new Error("2captcha reported ready without a gRecaptchaResponse");
      }
      logger.debug("Got solved recaptcha", { taskResult });
      return token;
    }
    await new Promise((r) => setTimeout(r, TWOCAPTCHA_POLL_INTERVAL_MS));
  }
  throw new Error(
    `2captcha task ${task.taskId} not solved within ${TWOCAPTCHA_MAX_WAIT_MS} ms`
  );
}