1
1
package mServer .crawler .sender .orf .tasks ;
2
2
3
- import mServer .crawler .sender .base .AbstractUrlTask ;
3
+ import mServer .crawler .sender .base .* ;
4
4
import com .google .gson .Gson ;
5
5
import com .google .gson .GsonBuilder ;
6
6
import com .google .gson .reflect .TypeToken ;
7
7
import de .mediathekview .mlib .daten .DatenFilm ;
8
8
import de .mediathekview .mlib .tool .Log ;
9
+
10
+ import java .io .IOException ;
9
11
import java .lang .reflect .Type ;
10
12
import java .time .Duration ;
11
13
import java .time .LocalDateTime ;
19
21
import java .util .concurrent .ConcurrentLinkedQueue ;
20
22
import mServer .crawler .CrawlerTool ;
21
23
import mServer .crawler .sender .MediathekReader ;
22
- import mServer .crawler .sender .base .Qualities ;
23
- import mServer .crawler .sender .base .HtmlDocumentUtils ;
24
24
import mServer .crawler .sender .orf .OrfEpisodeInfoDTO ;
25
25
import mServer .crawler .sender .orf .OrfVideoInfoDTO ;
26
26
import mServer .crawler .sender .orf .TopicUrlDTO ;
27
+ import mServer .crawler .sender .orf .json .OrfMoreEpisodesDeserializer ;
28
+ import mServer .crawler .sender .orf .parser .OrfMoreEpisodesParser ;
27
29
import mServer .crawler .sender .orf .parser .OrfPlaylistDeserializer ;
28
30
import org .apache .commons .lang3 .StringUtils ;
29
31
import org .jsoup .nodes .Document ;
@@ -36,6 +38,7 @@ public class OrfFilmDetailTask extends OrfTaskBase<DatenFilm, TopicUrlDTO> {
36
38
private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration" ;
37
39
private static final String DESCRIPTION_SELECTOR = ".description-container .description-text" ;
38
40
private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist" ;
41
+ private static final String MORE_EPISODES_SELECTOR = "div.more-episodes" ;
39
42
40
43
private static final String ATTRIBUTE_DATETIME = "datetime" ;
41
44
private static final String ATTRIBUTE_DATA_JSB = "data-jsb" ;
@@ -50,12 +53,18 @@ public class OrfFilmDetailTask extends OrfTaskBase<DatenFilm, TopicUrlDTO> {
50
53
private static final DateTimeFormatter TIME_FORMAT
51
54
= DateTimeFormatter .ofPattern ("HH:mm:ss" );
52
55
56
+ private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken <CrawlerUrlDTO >() {}.getType ();
53
57
private static final Type LIST_EPISODEINFO_TYPE_TOKEN = new TypeToken <List <OrfEpisodeInfoDTO >>() {
54
58
}.getType ();
55
59
60
+ private final boolean processMoreEpisodes ;
61
+ private final transient JsoupConnection jsoupConnection ;
62
+
56
63
/**
 * Creates a detail task for the given queue of film page URLs.
 *
 * @param aCrawler the crawler this task belongs to; handed to the base task
 * @param aUrlToCrawlDTOs the URLs this task has to process; handed to the base task
 * @param processMoreEpisodes whether the "more episodes" section of each film page
 *     should be evaluated and the episodes found there crawled as well
 */
public OrfFilmDetailTask(final MediathekReader aCrawler,
    final ConcurrentLinkedQueue<TopicUrlDTO> aUrlToCrawlDTOs,
    boolean processMoreEpisodes) {
  super(aCrawler, aUrlToCrawlDTOs);
  // Each task instance gets its own connection helper for loading the
  // "more episodes" documents.
  this.jsoupConnection = new JsoupConnection();
  this.processMoreEpisodes = processMoreEpisodes;
}
60
69
61
70
@ Override
@@ -76,12 +85,22 @@ protected void processDocument(TopicUrlDTO aUrlDTO, Document aDocument) {
76
85
}
77
86
}
78
87
88
+ if (processMoreEpisodes ) {
89
+ final List <TopicUrlDTO > topicUrlDTOS = parseMoreEpisodes (aDocument , aUrlDTO .getTopic ());
90
+ topicUrlDTOS .remove (aUrlDTO );
91
+ processMoreEpisodes (topicUrlDTOS );
92
+ }
93
+
79
94
ORF_LOGGER .trace (String .format ("%s - %s: Anzahl Filme: %d" , aUrlDTO .getTopic (), aUrlDTO .getUrl (), taskResults .size ()));
80
95
}
81
96
82
97
@ Override
83
98
protected AbstractUrlTask <DatenFilm , TopicUrlDTO > createNewOwnInstance (ConcurrentLinkedQueue <TopicUrlDTO > aURLsToCrawl ) {
84
- return new OrfFilmDetailTask (crawler , aURLsToCrawl );
99
+ return createNewOwnInstance (aURLsToCrawl , processMoreEpisodes );
100
+ }
101
+
102
+ private AbstractUrlTask <DatenFilm , TopicUrlDTO > createNewOwnInstance (final ConcurrentLinkedQueue <TopicUrlDTO > urlsToCrawl , boolean processMoreEpisodes ) {
103
+ return new OrfFilmDetailTask (crawler , urlsToCrawl , processMoreEpisodes );
85
104
}
86
105
87
106
private void createFilm (final TopicUrlDTO aUrlDTO ,
@@ -218,4 +237,36 @@ private static Optional<ChronoUnit> determineChronoUnit(String aDuration) {
218
237
219
238
return Optional .empty ();
220
239
}
240
+
241
+ private List <TopicUrlDTO > parseMoreEpisodes (final Document document , final String topic ) {
242
+ final Optional <String > json = HtmlDocumentUtils .getElementAttributeString (MORE_EPISODES_SELECTOR , ATTRIBUTE_DATA_JSB , document );
243
+ if (json .isPresent ()) {
244
+ final Gson gson =
245
+ new GsonBuilder ()
246
+ .registerTypeAdapter (CRAWLER_URL_TYPE_TOKEN , new OrfMoreEpisodesDeserializer ())
247
+ .create ();
248
+
249
+ CrawlerUrlDTO moreEpisodesUrl = gson .fromJson (json .get (), CRAWLER_URL_TYPE_TOKEN );
250
+ if (moreEpisodesUrl != null ) {
251
+ try {
252
+ final Document moreEpisodesDocument = jsoupConnection .getDocument (moreEpisodesUrl .getUrl ());
253
+ OrfMoreEpisodesParser parser = new OrfMoreEpisodesParser ();
254
+ return parser .parse (moreEpisodesDocument , topic );
255
+ } catch (IOException e ) {
256
+ Log .errorLog (237462889 , String .format ("OrfFilmDetailTask: loading more episodes url %s failed." , moreEpisodesUrl .getUrl ()));
257
+ }
258
+ }
259
+ }
260
+
261
+ return new ArrayList <>();
262
+ }
263
+
264
+ private void processMoreEpisodes (final List <TopicUrlDTO > moreFilms ) {
265
+ if (moreFilms != null && !moreFilms .isEmpty ()) {
266
+ final ConcurrentLinkedQueue <TopicUrlDTO > queue = new ConcurrentLinkedQueue <>(moreFilms );
267
+ final OrfFilmDetailTask task = (OrfFilmDetailTask ) createNewOwnInstance (queue , false );
268
+ task .fork ();
269
+ taskResults .addAll (task .join ());
270
+ }
271
+ }
221
272
}
0 commit comments