From ad46f2f33aa5f3b1be9fd97edf0db6ca79d99fc9 Mon Sep 17 00:00:00 2001 From: in-seo Date: Sun, 12 Jan 2025 13:18:03 +0900 Subject: [PATCH] hotfix: okky xss path selector --- .../Matching/SouP/common/SlackNotifier.java | 7 +++-- .../SouP/crawler/Hola/HolaService.java | 24 +++++++++++------ .../SouP/crawler/okky/OkkyService.java | 27 +++++++++---------- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/SouP/src/main/java/Matching/SouP/common/SlackNotifier.java b/SouP/src/main/java/Matching/SouP/common/SlackNotifier.java index 414b0bb..ca5e081 100644 --- a/SouP/src/main/java/Matching/SouP/common/SlackNotifier.java +++ b/SouP/src/main/java/Matching/SouP/common/SlackNotifier.java @@ -9,13 +9,13 @@ public class SlackNotifier { private static final OkHttpClient client = new OkHttpClient(); - public void sendMessageToSlack() { + public void sendMessageToSlack(String errorMessage) { String webHookURL = PropertyUtil.getProperty("webhook.url"); String message = "OKKY 파싱 에러"; RequestBody body = RequestBody.create( MediaType.parse("application/json; charset=utf-8"), - "{\"text\":\"" + message + "\"}" + "{\"text\":\"" + message + " " + errorMessage + "\"}" ); Request request = new Request.Builder() @@ -27,9 +27,8 @@ public void sendMessageToSlack() { if (!response.isSuccessful()) { throw new RuntimeException("Unexpected code " + response); } - log.warn("Message sent successfully: " + response.body().string()); } catch (Exception e) { - e.printStackTrace(); + log.error(e.toString()); } } } diff --git a/SouP/src/main/java/Matching/SouP/crawler/Hola/HolaService.java b/SouP/src/main/java/Matching/SouP/crawler/Hola/HolaService.java index 0862dd0..dc0f2d2 100644 --- a/SouP/src/main/java/Matching/SouP/crawler/Hola/HolaService.java +++ b/SouP/src/main/java/Matching/SouP/crawler/Hola/HolaService.java @@ -9,6 +9,7 @@ import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import org.openqa.selenium.By; +import org.openqa.selenium.NoSuchElementException; import org.openqa.selenium.WebDriver; import org.springframework.stereotype.Service; @@ -37,19 +38,26 @@ public void getHolaPostData(){ Thread.sleep(500); int count = element.select(">a").size(); log.warn("글 갯수 = {} ",count); + boolean hasAdvertise = false; for (int i = count; i > 0; i--) { if(i==count){ driver.findElement(By.cssSelector("#root > main > ul > a:nth-child(1)")).click(); - String first = driver.getCurrentUrl().substring(beginIndex); - if(first.compareTo(standard) <= 0) { - log.warn("사이트 내 가장 최신글 번호 = {}, 따라서 불러올 글이 없습니다!",first); - return; + try { + String first = driver.getCurrentUrl().substring(beginIndex); + if(first.compareTo(standard) <= 0) { + log.warn("사이트 내 가장 최신글 번호 = {}, 따라서 불러올 글이 없습니다!",first); + return; + } + else + driver.navigate().back(); + } catch (NoSuchElementException e) { + hasAdvertise = true; } - else - driver.navigate().back(); } - int aSelector = i*2 - 1; // 홀수번만 사용 예정 - Elements eachPost = element.select("a:nth-child(" + aSelector + ")"); + int childNum = i * 2 - 1; // 홀수번만 사용 예정 + if (hasAdvertise) // 광고가 있을경우 짝수번만 사용 + childNum = i * 2; + Elements eachPost = element.select("a:nth-child(" + childNum + ")"); driver.get(urlHola + eachPost.attr("href")); Thread.sleep(500); Document realPost = Jsoup.parse(driver.getPageSource()); diff --git a/SouP/src/main/java/Matching/SouP/crawler/okky/OkkyService.java b/SouP/src/main/java/Matching/SouP/crawler/okky/OkkyService.java index 8bfdeec..58c231e 100644 --- a/SouP/src/main/java/Matching/SouP/crawler/okky/OkkyService.java +++ b/SouP/src/main/java/Matching/SouP/crawler/okky/OkkyService.java @@ -8,6 +8,7 @@ import lombok.extern.slf4j.Slf4j; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.openqa.selenium.WebDriver; import org.springframework.stereotype.Service; @@ -35,11 +36,11 @@ public void getOkkyPostData() { driver.get(urlOkky + "?page=" + Page); String html = driver.getPageSource(); Document doc = Jsoup.parse(html); - for (int i = 23; i > 4; i--) { //오래된 글부터 크롤링 그럼 반드시 최신글은 DB에서 가장 밑에꺼임. - if(i==10) // 공지, 광고 제거 - continue; - Elements element = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li:nth-child(" + i + ")"); - Elements title = element.select("div > div.my-2 > a"); + Elements elements = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li[class^=\"py\"]\n"); + for (int i = elements.size() - 1; i >= 0; i--) { + Element element = elements.get(i); + Elements title = element.select("div > div.my-2 > div > a"); + // 여기서 각 element에 대한 처리를 진행 String postName = title.text(); String num; try { @@ -94,16 +95,16 @@ private Document click(WebDriver driver, String link) throws InterruptedExceptio private int startPage(WebDriver driver, int start) throws StringIndexOutOfBoundsException { - int page=2; //page가 1이면 okky에선 1페이지이다.. + int page=2; /** - * 디비에서 저장된 가장 최근 글이 1페이지에 있나 여부 판단. 만약 글 리젠이 많아서 2페이지 중반부터 크롤링 해야되면? 3페이지 첫글이 start보다 작아야 됌. + * 디비에서 저장된 가장 최근 글이 1페이지에 있나 여부 판단. 만약 글 리젠이 많아서 2페이지 중반부터 크롤링 해야되면? 3페이지 첫글이 start보다 작아야 됨. * !!다음 페이지의 맨 첫 번째 글이, 가장 최근에 디비에 저장된 글의 번호보다 크면 다음 페이지로 넘어가야됌 */ - int cnt = 4; + int cnt = 1; while(true){ if (page > 5 || cnt > 6) { SlackNotifier slackNotifier = new SlackNotifier(); - slackNotifier.sendMessageToSlack(); + slackNotifier.sendMessageToSlack("시작 페이지를 찾지 못했습니다."); throw new IllegalStateException("오키 파싱 에러"); } driver.get(urlOkky + "?page=" + page); @@ -111,23 +112,21 @@ private int startPage(WebDriver driver, int start) throws StringIndexOutOfBounds Document doc = Jsoup.parse(html); int num = Integer.MAX_VALUE; try { - String href = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li:nth-child(" + cnt + ") > div > div.my-2 > a") - .attr("href"); - + Elements elements = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li[class^=\"py\"]\n"); + String href = elements.get(0).select("div > div.my-2 > div > a").attr("href"); // 각 페이지 첫 글의 번호를 통해 페이지를 선택하자. String sNum = href.substring(10, href.lastIndexOf('?')); num = Integer.parseInt(sNum); }catch (StringIndexOutOfBoundsException | NullPointerException e){ cnt++; log.info("StringIndexOutOfBoundsException"); SlackNotifier slackNotifier = new SlackNotifier(); - slackNotifier.sendMessageToSlack(); + slackNotifier.sendMessageToSlack(e.getMessage()); continue; } if(num