8
8
import de .mediathekview .mlib .daten .GeoLocations ;
9
9
import de .mediathekview .mlib .daten .Resolution ;
10
10
import de .mediathekview .mserver .base .utils .HtmlDocumentUtils ;
11
- import de .mediathekview .mserver .crawler .basic .AbstractCrawler ;
12
- import de .mediathekview .mserver .crawler .basic .AbstractDocumentTask ;
13
- import de .mediathekview .mserver .crawler .basic .AbstractUrlTask ;
14
- import de .mediathekview .mserver .crawler .basic .TopicUrlDTO ;
11
+ import de .mediathekview .mserver .crawler .basic .*;
15
12
import de .mediathekview .mserver .crawler .orf .OrfEpisodeInfoDTO ;
16
13
import de .mediathekview .mserver .crawler .orf .OrfVideoInfoDTO ;
14
+ import de .mediathekview .mserver .crawler .orf .json .OrfMoreEpisodesDeserializer ;
15
+ import de .mediathekview .mserver .crawler .orf .parser .OrfMoreEpisodesParser ;
17
16
import de .mediathekview .mserver .crawler .orf .parser .OrfPlaylistDeserializer ;
18
17
import org .apache .commons .lang3 .StringUtils ;
19
18
import org .apache .logging .log4j .LogManager ;
20
19
import org .apache .logging .log4j .Logger ;
21
20
import org .jsoup .nodes .Document ;
22
21
22
+ import java .io .IOException ;
23
23
import java .lang .reflect .Type ;
24
24
import java .net .MalformedURLException ;
25
25
import java .net .URL ;
29
29
import java .time .format .DateTimeParseException ;
30
30
import java .time .temporal .ChronoUnit ;
31
31
import java .util .*;
32
+ import java .util .concurrent .ConcurrentLinkedQueue ;
32
33
33
34
public class OrfFilmDetailTask extends AbstractDocumentTask <Film , TopicUrlDTO > {
34
35
@@ -40,21 +41,25 @@ public class OrfFilmDetailTask extends AbstractDocumentTask<Film, TopicUrlDTO> {
40
41
private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration" ;
41
42
private static final String DESCRIPTION_SELECTOR = ".description-container .description-text" ;
42
43
private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist" ;
44
// Selector handed to HtmlDocumentUtils.getElementAttributeString in parseMoreEpisodes,
// together with ATTRIBUTE_DATA_JSB, to read the "more episodes" JSON from the page.
+ private static final String MORE_EPISODES_SELECTOR = "div.more-episodes" ;
43
45
44
46
private static final String ATTRIBUTE_DATETIME = "datetime" ;
45
47
private static final String ATTRIBUTE_DATA_JSB = "data-jsb" ;
46
-
47
48
private static final String PREFIX_AUDIO_DESCRIPTION = "AD |" ;
48
49
49
50
private static final DateTimeFormatter DATE_TIME_FORMATTER =
50
51
DateTimeFormatter .ofPattern ("yyyy-MM-dd HH:mm:ss" );
51
52
53
// Gson type under which OrfMoreEpisodesDeserializer is registered and into which
// the "more episodes" JSON is deserialized (see parseMoreEpisodes).
+ private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken <CrawlerUrlDTO >() {}.getType ();
52
54
private static final Type LIST_EPISODEINFO_TYPE_TOKEN =
53
55
new TypeToken <List <OrfEpisodeInfoDTO >>() {}.getType ();
56
// When true, processDocument also parses and crawls the page's "more episodes" list.
// Tasks spawned for those additional episodes are created with false.
+ private final boolean processMoreEpisodes ;
54
57
55
58
/**
 * Creates a task that processes the given film URLs.
 *
 * @param aCrawler the crawler this task reports to; passed on to the super class
 * @param aUrlToCrawlDtos the URLs this task has to process; passed on to the super class
 * @param processMoreEpisodes if {@code true}, the "more episodes" list of each processed page
 *     is parsed and the episodes found there are crawled as well
 */
public OrfFilmDetailTask(
    final AbstractCrawler aCrawler,
    final Queue<TopicUrlDTO> aUrlToCrawlDtos,
    final boolean processMoreEpisodes) {
  super(aCrawler, aUrlToCrawlDtos);

  this.processMoreEpisodes = processMoreEpisodes;
}
59
64
60
65
private static Optional <LocalDateTime > parseDate (final Document aDocument ) {
@@ -147,12 +152,22 @@ protected void processDocument(final TopicUrlDTO aUrlDto, final Document aDocume
147
152
episode .getDuration ());
148
153
}
149
154
}
155
+
156
+ if (processMoreEpisodes ) {
157
+ final List <TopicUrlDTO > topicUrlDTOS = parseMoreEpisodes (aDocument , aUrlDto .getTopic ());
158
+ topicUrlDTOS .remove (aUrlDto );
159
+ processMoreEpisodes (topicUrlDTOS );
160
+ }
150
161
}
151
162
152
163
@ Override
153
164
protected AbstractUrlTask <Film , TopicUrlDTO > createNewOwnInstance (
154
165
final Queue <TopicUrlDTO > aUrlsToCrawl ) {
155
- return new OrfFilmDetailTask (crawler , aUrlsToCrawl );
166
+ return createNewOwnInstance (aUrlsToCrawl , true );
167
+ }
168
+
169
+ private AbstractUrlTask <Film , TopicUrlDTO > createNewOwnInstance (final Queue <TopicUrlDTO > urlsToCrawl , boolean processMoreEpisodes ) {
170
+ return new OrfFilmDetailTask (crawler , urlsToCrawl , processMoreEpisodes );
156
171
}
157
172
158
173
private void createFilm (
@@ -255,4 +270,37 @@ private List<OrfEpisodeInfoDTO> parseEpisodes(final Document aDocument) {
255
270
256
271
return new ArrayList <>();
257
272
}
273
+
274
+ private List <TopicUrlDTO > parseMoreEpisodes (final Document document , final String topic ) {
275
+ final Optional <String > json = HtmlDocumentUtils .getElementAttributeString (MORE_EPISODES_SELECTOR , ATTRIBUTE_DATA_JSB , document );
276
+ if (json .isPresent ()) {
277
+ final Gson gson =
278
+ new GsonBuilder ()
279
+ .registerTypeAdapter (CRAWLER_URL_TYPE_TOKEN , new OrfMoreEpisodesDeserializer ())
280
+ .create ();
281
+
282
+ CrawlerUrlDTO moreEpisodesUrl = gson .fromJson (json .get (), CRAWLER_URL_TYPE_TOKEN );
283
+ if (moreEpisodesUrl != null ) {
284
+ try {
285
+ final Document moreEpisodesDocument = crawler .requestBodyAsHtmlDocument (moreEpisodesUrl .getUrl ());
286
+ OrfMoreEpisodesParser parser = new OrfMoreEpisodesParser ();
287
+ return parser .parse (moreEpisodesDocument , topic );
288
+ } catch (IOException e ) {
289
+ LOG .error ("OrfFilmDetailTask: loading more episodes url {} failed." , moreEpisodesUrl .getUrl ());
290
+ crawler .incrementAndGetErrorCount ();
291
+ }
292
+ }
293
+ }
294
+
295
+ return new ArrayList <>();
296
+ }
297
+
298
+ private void processMoreEpisodes (final List <TopicUrlDTO > moreFilms ) {
299
+ if (moreFilms != null && !moreFilms .isEmpty ()) {
300
+ final Queue <TopicUrlDTO > queue = new ConcurrentLinkedQueue <>(moreFilms );
301
+ final OrfFilmDetailTask task = (OrfFilmDetailTask ) createNewOwnInstance (queue , false );
302
+ task .fork ();
303
+ taskResults .addAll (task .join ());
304
+ }
305
+ }
258
306
}
0 commit comments