Skip to content

Commit

Permalink
hotfix: okky css path selector
Browse files Browse the repository at this point in the history
  • Loading branch information
in-seo committed Jan 12, 2025
1 parent 76ca224 commit ad46f2f
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 26 deletions.
7 changes: 3 additions & 4 deletions SouP/src/main/java/Matching/SouP/common/SlackNotifier.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
public class SlackNotifier {
private static final OkHttpClient client = new OkHttpClient();

public void sendMessageToSlack() {
public void sendMessageToSlack(String errorMessage) {
String webHookURL = PropertyUtil.getProperty("webhook.url");
String message = "OKKY 파싱 에러";

RequestBody body = RequestBody.create(
MediaType.parse("application/json; charset=utf-8"),
"{\"text\":\"" + message + "\"}"
"{\"text\":\"" + message + " " + errorMessage + "\"}"
);

Request request = new Request.Builder()
Expand All @@ -27,9 +27,8 @@ public void sendMessageToSlack() {
if (!response.isSuccessful()) {
throw new RuntimeException("Unexpected code " + response);
}
log.warn("Message sent successfully: " + response.body().string());
} catch (Exception e) {
e.printStackTrace();
log.error(e.toString());
}
}
}
24 changes: 16 additions & 8 deletions SouP/src/main/java/Matching/SouP/crawler/Hola/HolaService.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.WebDriver;
import org.springframework.stereotype.Service;

Expand Down Expand Up @@ -37,19 +38,26 @@ public void getHolaPostData(){
Thread.sleep(500);
int count = element.select(">a").size();
log.warn("글 갯수 = {} ",count);
boolean hasAdvertise = false;
for (int i = count; i > 0; i--) {
if(i==count){
driver.findElement(By.cssSelector("#root > main > ul > a:nth-child(1)")).click();
String first = driver.getCurrentUrl().substring(beginIndex);
if(first.compareTo(standard) <= 0) {
log.warn("사이트 내 가장 최신글 번호 = {}, 따라서 불러올 글이 없습니다!",first);
return;
try {
String first = driver.getCurrentUrl().substring(beginIndex);
if(first.compareTo(standard) <= 0) {
log.warn("사이트 내 가장 최신글 번호 = {}, 따라서 불러올 글이 없습니다!",first);
return;
}
else
driver.navigate().back();
} catch (NoSuchElementException e) {
hasAdvertise = true;
}
else
driver.navigate().back();
}
int aSelector = i*2 - 1; // 홀수번만 사용 예정
Elements eachPost = element.select("a:nth-child(" + aSelector + ")");
int childNum = i * 2 - 1; // 홀수번만 사용 예정
if (hasAdvertise) // 광고가 있을경우 짝수번만 사용
childNum = i * 2;
Elements eachPost = element.select("a:nth-child(" + childNum + ")");
driver.get(urlHola + eachPost.attr("href"));
Thread.sleep(500);
Document realPost = Jsoup.parse(driver.getPageSource());
Expand Down
27 changes: 13 additions & 14 deletions SouP/src/main/java/Matching/SouP/crawler/okky/OkkyService.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.WebDriver;
import org.springframework.stereotype.Service;
Expand Down Expand Up @@ -35,11 +36,11 @@ public void getOkkyPostData() {
driver.get(urlOkky + "?page=" + Page);
String html = driver.getPageSource();
Document doc = Jsoup.parse(html);
for (int i = 23; i > 4; i--) { //오래된 글부터 크롤링 그럼 반드시 최신글은 DB에서 가장 밑에꺼임.
if(i==10) // 공지, 광고 제거
continue;
Elements element = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li:nth-child(" + i + ")");
Elements title = element.select("div > div.my-2 > a");
Elements elements = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li[class^=\"py\"]\n");
for (int i = elements.size() - 1; i >= 0; i--) {
Element element = elements.get(i);
Elements title = element.select("div > div.my-2 > div > a");
// 여기서 각 element에 대한 처리를 진행
String postName = title.text();
String num;
try {
Expand Down Expand Up @@ -94,40 +95,38 @@ private Document click(WebDriver driver, String link) throws InterruptedExceptio


private int startPage(WebDriver driver, int start) throws StringIndexOutOfBoundsException {
int page=2; //page가 1이면 okky에선 1페이지이다..
int page=2;
/**
* 디비에서 저장된 가장 최근 글이 1페이지에 있나 여부 판단. 만약 글 리젠이 많아서 2페이지 중반부터 크롤링 해야되면? 3페이지 첫글이 start보다 작아야 .
* 디비에서 저장된 가장 최근 글이 1페이지에 있나 여부 판단. 만약 글 리젠이 많아서 2페이지 중반부터 크롤링 해야되면? 3페이지 첫글이 start보다 작아야 .
* !!다음 페이지의 맨 첫 번째 글이, 가장 최근에 디비에 저장된 글의 번호보다 크면 다음 페이지로 넘어가야됌
*/
int cnt = 4;
int cnt = 1;
while(true){
if (page > 5 || cnt > 6) {
SlackNotifier slackNotifier = new SlackNotifier();
slackNotifier.sendMessageToSlack();
slackNotifier.sendMessageToSlack("시작 페이지를 찾지 못했습니다.");
throw new IllegalStateException("오키 파싱 에러");
}
driver.get(urlOkky + "?page=" + page);
String html = driver.getPageSource();
Document doc = Jsoup.parse(html);
int num = Integer.MAX_VALUE;
try {
String href = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li:nth-child(" + cnt + ") > div > div.my-2 > a")
.attr("href");

Elements elements = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li[class^=\"py\"]\n");
String href = elements.get(0).select("div > div.my-2 > div > a").attr("href"); // 각 페이지 첫 글의 번호를 통해 페이지를 선택하자.
String sNum = href.substring(10, href.lastIndexOf('?'));
num = Integer.parseInt(sNum);
}catch (StringIndexOutOfBoundsException | NullPointerException e){
cnt++;
log.info("StringIndexOutOfBoundsException");
SlackNotifier slackNotifier = new SlackNotifier();
slackNotifier.sendMessageToSlack();
slackNotifier.sendMessageToSlack(e.getMessage());
continue;
}
if(num<start){
log.info("{}페이지부터 시작",page-1);
return page-1;
}
cnt=1;
page++;

}
Expand Down

0 comments on commit ad46f2f

Please sign in to comment.