Skip to content

Commit 0c3cb99

Browse files
committed
search more episodes
1 parent 5a91a39 commit 0c3cb99

File tree

10 files changed

+1257
-10
lines changed

10 files changed

+1257
-10
lines changed

src/main/java/de/mediathekview/mserver/crawler/orf/OrfCrawler.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,21 +99,23 @@ private Queue<TopicUrlDTO> getLetterEntries() throws InterruptedException, Execu
9999
@Override
100100
protected RecursiveTask<Set<Film>> createCrawlerTask() {
101101
try {
102+
boolean processMoreEpisodes = false;
102103

103104
final Queue<TopicUrlDTO> shows = new ConcurrentLinkedQueue<>();
104105

105106
if (Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())) {
106107
shows.addAll(getArchiveEntries());
107-
108108
addShows(shows, getLetterEntries());
109+
processMoreEpisodes = true;
109110
}
110111
addShows(shows, getDaysEntries());
111112

112113
printMessage(
113114
ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), shows.size());
114115
getAndSetMaxCount(shows.size());
115116

116-
return new OrfFilmDetailTask(this, shows);
117+
// TODO: solve the problem with "Sport aktuell" and similar shows => set "more episodes" per show (topic yes, day no?)
118+
return new OrfFilmDetailTask(this, shows, processMoreEpisodes);
117119
} catch (final InterruptedException ex) {
118120
LOG.debug("{} crawler interrupted.", getSender().getName(), ex);
119121
Thread.currentThread().interrupt();
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package de.mediathekview.mserver.crawler.orf.json;
2+
3+
import com.google.gson.JsonDeserializationContext;
4+
import com.google.gson.JsonDeserializer;
5+
import com.google.gson.JsonElement;
6+
import de.mediathekview.mserver.base.utils.JsonUtils;
7+
import de.mediathekview.mserver.base.utils.UrlUtils;
8+
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
9+
import de.mediathekview.mserver.crawler.orf.OrfConstants;
10+
11+
import java.lang.reflect.Type;
12+
import java.util.Optional;
13+
14+
public class OrfMoreEpisodesDeserializer implements JsonDeserializer<CrawlerUrlDTO> {
15+
16+
private static final String ATTRIBUTE_URL = "url";
17+
18+
@Override
19+
public CrawlerUrlDTO deserialize(
20+
JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {
21+
22+
final Optional<String> url =
23+
JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL);
24+
return url.map(s -> new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(s, OrfConstants.URL_BASE))).orElse(null);
25+
}
26+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package de.mediathekview.mserver.crawler.orf.parser;
2+
3+
import de.mediathekview.mserver.base.HtmlConsts;
4+
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
5+
import java.util.ArrayList;
6+
import java.util.List;
7+
import org.jsoup.nodes.Document;
8+
9+
public class OrfMoreEpisodesParser {
10+
private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link";
11+
12+
public List<TopicUrlDTO> parse(final Document document, final String topic) {
13+
final List<TopicUrlDTO> result = new ArrayList<>();
14+
15+
document
16+
.select(EPISODES_SELECTOR)
17+
.forEach(
18+
episode -> {
19+
final String url = episode.attr(HtmlConsts.ATTRIBUTE_HREF);
20+
result.add(new TopicUrlDTO(topic, url));
21+
});
22+
23+
return result;
24+
}
25+
}

src/main/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTask.java

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,18 @@
88
import de.mediathekview.mlib.daten.GeoLocations;
99
import de.mediathekview.mlib.daten.Resolution;
1010
import de.mediathekview.mserver.base.utils.HtmlDocumentUtils;
11-
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
12-
import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
13-
import de.mediathekview.mserver.crawler.basic.AbstractUrlTask;
14-
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
11+
import de.mediathekview.mserver.crawler.basic.*;
1512
import de.mediathekview.mserver.crawler.orf.OrfEpisodeInfoDTO;
1613
import de.mediathekview.mserver.crawler.orf.OrfVideoInfoDTO;
14+
import de.mediathekview.mserver.crawler.orf.json.OrfMoreEpisodesDeserializer;
15+
import de.mediathekview.mserver.crawler.orf.parser.OrfMoreEpisodesParser;
1716
import de.mediathekview.mserver.crawler.orf.parser.OrfPlaylistDeserializer;
1817
import org.apache.commons.lang3.StringUtils;
1918
import org.apache.logging.log4j.LogManager;
2019
import org.apache.logging.log4j.Logger;
2120
import org.jsoup.nodes.Document;
2221

22+
import java.io.IOException;
2323
import java.lang.reflect.Type;
2424
import java.net.MalformedURLException;
2525
import java.net.URL;
@@ -29,6 +29,7 @@
2929
import java.time.format.DateTimeParseException;
3030
import java.time.temporal.ChronoUnit;
3131
import java.util.*;
32+
import java.util.concurrent.ConcurrentLinkedQueue;
3233

3334
public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {
3435

@@ -40,21 +41,25 @@ public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {
4041
private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration";
4142
private static final String DESCRIPTION_SELECTOR = ".description-container .description-text";
4243
private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist";
44+
private static final String MORE_EPISODES_SELECTOR = "div.more-episodes";
4345

4446
private static final String ATTRIBUTE_DATETIME = "datetime";
4547
private static final String ATTRIBUTE_DATA_JSB = "data-jsb";
46-
4748
private static final String PREFIX_AUDIO_DESCRIPTION = "AD |";
4849

4950
private static final DateTimeFormatter DATE_TIME_FORMATTER =
5051
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
5152

53+
private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken<CrawlerUrlDTO>() {}.getType();
5254
private static final Type LIST_EPISODEINFO_TYPE_TOKEN =
5355
new TypeToken<List<OrfEpisodeInfoDTO>>() {}.getType();
56+
private final boolean processMoreEpisodes;
5457

5558
/**
 * Creates a film detail task for the given URLs.
 *
 * @param aCrawler the crawler this task belongs to; handed to the super class
 * @param aUrlToCrawlDtos the URLs to process; handed to the super class
 * @param processMoreEpisodes whether the task should also look for and process the "more
 *     episodes" of each processed document
 */
public OrfFilmDetailTask(
    final AbstractCrawler aCrawler,
    final Queue<TopicUrlDTO> aUrlToCrawlDtos,
    final boolean processMoreEpisodes) {
  super(aCrawler, aUrlToCrawlDtos);
  this.processMoreEpisodes = processMoreEpisodes;
}
5964

6065
private static Optional<LocalDateTime> parseDate(final Document aDocument) {
@@ -147,12 +152,22 @@ protected void processDocument(final TopicUrlDTO aUrlDto, final Document aDocume
147152
episode.getDuration());
148153
}
149154
}
155+
156+
if (processMoreEpisodes) {
157+
final List<TopicUrlDTO> topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDto.getTopic());
158+
topicUrlDTOS.remove(aUrlDto);
159+
processMoreEpisodes(topicUrlDTOS);
160+
}
150161
}
151162

152163
@Override
153164
protected AbstractUrlTask<Film, TopicUrlDTO> createNewOwnInstance(
154165
final Queue<TopicUrlDTO> aUrlsToCrawl) {
155-
return new OrfFilmDetailTask(crawler, aUrlsToCrawl);
166+
return createNewOwnInstance(aUrlsToCrawl, true);
167+
}
168+
169+
private AbstractUrlTask<Film, TopicUrlDTO> createNewOwnInstance(final Queue<TopicUrlDTO> urlsToCrawl, boolean processMoreEpisodes) {
170+
return new OrfFilmDetailTask(crawler, urlsToCrawl, processMoreEpisodes);
156171
}
157172

158173
private void createFilm(
@@ -255,4 +270,37 @@ private List<OrfEpisodeInfoDTO> parseEpisodes(final Document aDocument) {
255270

256271
return new ArrayList<>();
257272
}
273+
274+
private List<TopicUrlDTO> parseMoreEpisodes(final Document document, final String topic) {
275+
final Optional<String> json = HtmlDocumentUtils.getElementAttributeString(MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document);
276+
if (json.isPresent()) {
277+
final Gson gson =
278+
new GsonBuilder()
279+
.registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer())
280+
.create();
281+
282+
CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN);
283+
if (moreEpisodesUrl != null) {
284+
try {
285+
final Document moreEpisodesDocument = crawler.requestBodyAsHtmlDocument(moreEpisodesUrl.getUrl());
286+
OrfMoreEpisodesParser parser = new OrfMoreEpisodesParser();
287+
return parser.parse(moreEpisodesDocument, topic);
288+
} catch (IOException e) {
289+
LOG.error("OrfFilmDetailTask: loading more episodes url {} failed.", moreEpisodesUrl.getUrl());
290+
crawler.incrementAndGetErrorCount();
291+
}
292+
}
293+
}
294+
295+
return new ArrayList<>();
296+
}
297+
298+
private void processMoreEpisodes(final List<TopicUrlDTO> moreFilms) {
299+
if (moreFilms != null && !moreFilms.isEmpty()) {
300+
final Queue<TopicUrlDTO> queue = new ConcurrentLinkedQueue<>(moreFilms);
301+
final OrfFilmDetailTask task = (OrfFilmDetailTask) createNewOwnInstance(queue, false);
302+
task.fork();
303+
taskResults.addAll(task.join());
304+
}
305+
}
258306
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package de.mediathekview.mserver.crawler.orf.json;
2+
3+
import com.google.gson.JsonElement;
4+
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
5+
import de.mediathekview.mserver.testhelper.JsonFileReader;
6+
import org.junit.jupiter.api.Test;
7+
8+
import static org.junit.jupiter.api.Assertions.*;
9+
10+
class OrfMoreEpisodesDeserializerTest {
11+
12+
@Test
13+
void testDeserialize() {
14+
final JsonElement jsonElement = JsonFileReader.readJson("/orf/orf_film_more_episodes.json");
15+
16+
final OrfMoreEpisodesDeserializer target = new OrfMoreEpisodesDeserializer();
17+
final CrawlerUrlDTO actual = target.deserialize(jsonElement, null, null);
18+
19+
assertNotNull(actual);
20+
assertEquals("https://tvthek.orf.at/lane-plus/other_episodes_of_profile?profileId=13895917&profileSlug=Biester", actual.getUrl());
21+
22+
}
23+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package de.mediathekview.mserver.crawler.orf.parser;
2+
3+
import de.mediathekview.mserver.crawler.basic.TopicUrlDTO;
4+
import de.mediathekview.mserver.testhelper.FileReader;
5+
import org.hamcrest.MatcherAssert;
6+
import org.hamcrest.Matchers;
7+
import org.jsoup.Jsoup;
8+
import org.jsoup.nodes.Document;
9+
import org.junit.jupiter.api.Test;
10+
11+
import java.util.List;
12+
13+
import static org.junit.jupiter.api.Assertions.*;
14+
15+
class OrfMoreEpisodesParserTest {
16+
@Test
17+
void parseDocumentWithEpisodes() {
18+
TopicUrlDTO[] expectedFilms = new TopicUrlDTO[] {
19+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-9/14207236"),
20+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-8/14207235"),
21+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-7/14207234"),
22+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-6/14207233"),
23+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-5/14207232"),
24+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-4/14207231"),
25+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-3/14207230"),
26+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-2/14207229"),
27+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Alle-Folgen-jetzt-Biester-1-10/14207227"),
28+
new TopicUrlDTO("Biester", "https://tvthek.orf.at/profile/Biester/13895917/Biester-Folge-10/14207252"),
29+
};
30+
31+
final Document document = Jsoup.parse(FileReader.readFile("/orf/orf_film_more_episodes.html"));
32+
33+
OrfMoreEpisodesParser target = new OrfMoreEpisodesParser();
34+
final List<TopicUrlDTO> actual = target.parse(document, "Biester");
35+
36+
assertEquals(10, actual.size());
37+
MatcherAssert.assertThat(actual, Matchers.containsInAnyOrder(expectedFilms));
38+
}
39+
}

src/test/java/de/mediathekview/mserver/crawler/orf/tasks/OrfFilmDetailTaskTestBase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public OrfFilmDetailTaskTestBase() {
1212
}
1313

1414
protected Set<Film> executeTask(OrfCrawler crawler, String aTheme, String aRequestUrl) {
15-
return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl))
15+
return new OrfFilmDetailTask(crawler, createCrawlerUrlDto(aTheme, aRequestUrl), false)
1616
.invoke();
1717
}
1818
}

0 commit comments

Comments
 (0)