Skip to content

Commit 0d99bfe

Browse files
committed
search more episodes in long run
1 parent 9c4d052 commit 0d99bfe

File tree

4 files changed

+119
-12
lines changed

4 files changed

+119
-12
lines changed

src/main/java/mServer/crawler/sender/orf/OrfCrawler.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -74,28 +74,31 @@ private ConcurrentLinkedQueue<TopicUrlDTO> getLetterEntries() throws Interrupted
7474
@Override
7575
protected RecursiveTask<Set<DatenFilm>> createCrawlerTask() {
7676

77+
boolean processMoreEpisodes = false;
78+
7779
final ConcurrentLinkedQueue<TopicUrlDTO> shows = new ConcurrentLinkedQueue<>();
7880
try {
7981

8082
if (CrawlerTool.loadLongMax()) {
8183
shows.addAll(getLetterEntries());
8284
shows.addAll(getArchiveEntries());
85+
processMoreEpisodes = true;
86+
} else {
87+
getDaysEntries().forEach(show -> {
88+
if (!shows.contains(show)) {
89+
shows.add(show);
90+
}
91+
});
8392
}
8493

85-
getDaysEntries().forEach(show -> {
86-
if (!shows.contains(show)) {
87-
shows.add(show);
88-
}
89-
});
90-
9194
} catch (InterruptedException | ExecutionException exception) {
9295
Log.errorLog(56146546, exception);
9396
}
9497
Log.sysLog("ORF Anzahl: " + shows.size());
9598

9699
meldungAddMax(shows.size());
97100

98-
return new OrfFilmDetailTask(this, shows);
101+
return new OrfFilmDetailTask(this, shows, processMoreEpisodes);
99102
}
100103

101104
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package mServer.crawler.sender.orf.json;
2+
3+
import com.google.gson.JsonDeserializationContext;
4+
import com.google.gson.JsonDeserializer;
5+
import com.google.gson.JsonElement;
6+
import mServer.crawler.sender.base.CrawlerUrlDTO;
7+
import mServer.crawler.sender.base.JsonUtils;
8+
import mServer.crawler.sender.base.UrlUtils;
9+
import mServer.crawler.sender.orf.OrfConstants;
10+
11+
import java.lang.reflect.Type;
12+
import java.util.Optional;
13+
14+
public class OrfMoreEpisodesDeserializer implements JsonDeserializer<CrawlerUrlDTO> {
15+
16+
private static final String ATTRIBUTE_URL = "url";
17+
18+
@Override
19+
public CrawlerUrlDTO deserialize(
20+
JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {
21+
22+
final Optional<String> url =
23+
JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL);
24+
return url.map(s -> new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(s, OrfConstants.URL_BASE))).orElse(null);
25+
}
26+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package mServer.crawler.sender.orf.parser;
2+
3+
4+
import mServer.crawler.sender.orf.TopicUrlDTO;
5+
import org.jsoup.nodes.Document;
6+
7+
import java.util.ArrayList;
8+
import java.util.List;
9+
10+
public class OrfMoreEpisodesParser {
11+
private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link";
12+
private static final String ATTRIBUTE_HREF = "href";
13+
14+
public List<TopicUrlDTO> parse(final Document document, final String topic) {
15+
final List<TopicUrlDTO> result = new ArrayList<>();
16+
17+
document
18+
.select(EPISODES_SELECTOR)
19+
.forEach(
20+
episode -> {
21+
final String url = episode.attr(ATTRIBUTE_HREF);
22+
result.add(new TopicUrlDTO(topic, url));
23+
});
24+
25+
return result;
26+
}
27+
}

src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
package mServer.crawler.sender.orf.tasks;
22

3-
import mServer.crawler.sender.base.AbstractUrlTask;
3+
import mServer.crawler.sender.base.*;
44
import com.google.gson.Gson;
55
import com.google.gson.GsonBuilder;
66
import com.google.gson.reflect.TypeToken;
77
import de.mediathekview.mlib.daten.DatenFilm;
88
import de.mediathekview.mlib.tool.Log;
9+
10+
import java.io.IOException;
911
import java.lang.reflect.Type;
1012
import java.time.Duration;
1113
import java.time.LocalDateTime;
@@ -19,11 +21,11 @@
1921
import java.util.concurrent.ConcurrentLinkedQueue;
2022
import mServer.crawler.CrawlerTool;
2123
import mServer.crawler.sender.MediathekReader;
22-
import mServer.crawler.sender.base.Qualities;
23-
import mServer.crawler.sender.base.HtmlDocumentUtils;
2424
import mServer.crawler.sender.orf.OrfEpisodeInfoDTO;
2525
import mServer.crawler.sender.orf.OrfVideoInfoDTO;
2626
import mServer.crawler.sender.orf.TopicUrlDTO;
27+
import mServer.crawler.sender.orf.json.OrfMoreEpisodesDeserializer;
28+
import mServer.crawler.sender.orf.parser.OrfMoreEpisodesParser;
2729
import mServer.crawler.sender.orf.parser.OrfPlaylistDeserializer;
2830
import org.apache.commons.lang3.StringUtils;
2931
import org.jsoup.nodes.Document;
@@ -36,6 +38,7 @@ public class OrfFilmDetailTask extends OrfTaskBase<DatenFilm, TopicUrlDTO> {
3638
private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration";
3739
private static final String DESCRIPTION_SELECTOR = ".description-container .description-text";
3840
private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist";
41+
private static final String MORE_EPISODES_SELECTOR = "div.more-episodes";
3942

4043
private static final String ATTRIBUTE_DATETIME = "datetime";
4144
private static final String ATTRIBUTE_DATA_JSB = "data-jsb";
@@ -50,12 +53,18 @@ public class OrfFilmDetailTask extends OrfTaskBase<DatenFilm, TopicUrlDTO> {
5053
private static final DateTimeFormatter TIME_FORMAT
5154
= DateTimeFormatter.ofPattern("HH:mm:ss");
5255

56+
private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken<CrawlerUrlDTO>() {}.getType();
5357
private static final Type LIST_EPISODEINFO_TYPE_TOKEN = new TypeToken<List<OrfEpisodeInfoDTO>>() {
5458
}.getType();
5559

60+
private final boolean processMoreEpisodes;
61+
private final transient JsoupConnection jsoupConnection;
62+
5663
/**
 * Creates a task that loads film details for the given URLs.
 *
 * @param aCrawler the crawler this task belongs to; handed to the base class
 * @param aUrlToCrawlDTOs the URLs to process; handed to the base class
 * @param processMoreEpisodes whether "more episodes" links found on a page are followed
 */
public OrfFilmDetailTask(
        final MediathekReader aCrawler,
        final ConcurrentLinkedQueue<TopicUrlDTO> aUrlToCrawlDTOs,
        boolean processMoreEpisodes) {
    super(aCrawler, aUrlToCrawlDTOs);
    this.processMoreEpisodes = processMoreEpisodes;
    this.jsoupConnection = new JsoupConnection();
}
6069

6170
@Override
@@ -76,12 +85,22 @@ protected void processDocument(TopicUrlDTO aUrlDTO, Document aDocument) {
7685
}
7786
}
7887

88+
if (processMoreEpisodes) {
89+
final List<TopicUrlDTO> topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDTO.getTopic());
90+
topicUrlDTOS.remove(aUrlDTO);
91+
processMoreEpisodes(topicUrlDTOS);
92+
}
93+
7994
ORF_LOGGER.trace(String.format("%s - %s: Anzahl Filme: %d", aUrlDTO.getTopic(), aUrlDTO.getUrl(), taskResults.size()));
8095
}
8196

8297
@Override
8398
protected AbstractUrlTask<DatenFilm, TopicUrlDTO> createNewOwnInstance(ConcurrentLinkedQueue<TopicUrlDTO> aURLsToCrawl) {
84-
return new OrfFilmDetailTask(crawler, aURLsToCrawl);
99+
return createNewOwnInstance(aURLsToCrawl, processMoreEpisodes);
100+
}
101+
102+
private AbstractUrlTask<DatenFilm, TopicUrlDTO> createNewOwnInstance(final ConcurrentLinkedQueue<TopicUrlDTO> urlsToCrawl, boolean processMoreEpisodes) {
103+
return new OrfFilmDetailTask(crawler, urlsToCrawl, processMoreEpisodes);
85104
}
86105

87106
private void createFilm(final TopicUrlDTO aUrlDTO,
@@ -218,4 +237,36 @@ private static Optional<ChronoUnit> determineChronoUnit(String aDuration) {
218237

219238
return Optional.empty();
220239
}
240+
241+
private List<TopicUrlDTO> parseMoreEpisodes(final Document document, final String topic) {
242+
final Optional<String> json = HtmlDocumentUtils.getElementAttributeString(MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document);
243+
if (json.isPresent()) {
244+
final Gson gson =
245+
new GsonBuilder()
246+
.registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer())
247+
.create();
248+
249+
CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN);
250+
if (moreEpisodesUrl != null) {
251+
try {
252+
final Document moreEpisodesDocument = jsoupConnection.getDocument(moreEpisodesUrl.getUrl());
253+
OrfMoreEpisodesParser parser = new OrfMoreEpisodesParser();
254+
return parser.parse(moreEpisodesDocument, topic);
255+
} catch (IOException e) {
256+
Log.errorLog(237462889, String.format("OrfFilmDetailTask: loading more episodes url %s failed.", moreEpisodesUrl.getUrl()));
257+
}
258+
}
259+
}
260+
261+
return new ArrayList<>();
262+
}
263+
264+
private void processMoreEpisodes(final List<TopicUrlDTO> moreFilms) {
265+
if (moreFilms != null && !moreFilms.isEmpty()) {
266+
final ConcurrentLinkedQueue<TopicUrlDTO> queue = new ConcurrentLinkedQueue<>(moreFilms);
267+
final OrfFilmDetailTask task = (OrfFilmDetailTask) createNewOwnInstance(queue, false);
268+
task.fork();
269+
taskResults.addAll(task.join());
270+
}
271+
}
221272
}

0 commit comments

Comments
 (0)