From 16e2afa712d1cccf50b12c7cd19783651baf9d6d Mon Sep 17 00:00:00 2001
From: CodingPF
Date: Wed, 8 Nov 2023 21:52:56 +0100
Subject: [PATCH] align classpath config and file config

---
 MServer-Config.yaml                    |  2 +-
 src/main/resources/MServer-Config.yaml | 74 ++++++++++++++++++++------
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/MServer-Config.yaml b/MServer-Config.yaml
index 752cf2d39..d6bc5192d 100644
--- a/MServer-Config.yaml
+++ b/MServer-Config.yaml
@@ -182,7 +182,7 @@ copySettings:
   # En- / disables FTP
   copyEnabled: false
 
-  # The paths where to safe the film list files.SrfTopicOverviewTask
+  # The paths where to safe the film list files.
   # WARNING: You can only set the path for film list formats you listed in "filmlistSaveFormats".
   # Required if enabled
   copyTargetFilePaths:
diff --git a/src/main/resources/MServer-Config.yaml b/src/main/resources/MServer-Config.yaml
index a08211881..8bb5ac0b5 100644
--- a/src/main/resources/MServer-Config.yaml
+++ b/src/main/resources/MServer-Config.yaml
@@ -7,6 +7,9 @@ maximumCpuThreads: 16
 # If set to 0 the server runs without a time limit.
 maximumServerDurationInMinutes: 0
 
+# Rate limiter
+maximumRequestsPerSecond: 999.0
+
 # These Sender will NOT be crawled.
 # If no Sender are included the server will crawl all Sender but these.
 #senderExcluded:
@@ -77,22 +80,40 @@ writeFilmlistIdFileEnabled: true
 #The fimlist id file path
 filmlistIdFilePath: target/filmlist.id
 
-# Sets if a filmlist should be imported
-filmlistImporEnabled: false
-
-# The format of the film list to import.
-# Possible are: JSON, OLD_JSON, JSON_COMPRESSED_XZ, OLD_JSON_COMPRESSED_XZ, JSON_COMPRESSED_GZIP, OLD_JSON_COMPRESSED_BZIP, JSON_COMPRESSED_GZIP, OLD_JSON_COMPRESSED_BZIP
-#filmlistImportFormat: OLD_JSON_COMPRESSED_XZ
-
-# The path/URL of the film list to import. 
-#filmlistImportLocation: http://verteiler1.mediathekview.de/Filmliste-akt.xz
+# import additional filmlist sources
+importFilmlistConfigurations :
+  - active: false
+    path: "someCrawlerlist.json"
+    format: OLD_JSON
+    createDiff: false
+    checkImportListUrl: false
+  - active: false
+    path: "someMoreCrawlerlist.json"
+    format: OLD_JSON
+    createDiff: false
+    checkImportListUrl: false
+  - active: false
+    path: "https://verteiler1.mediathekview.de/filme-org.xz"
+    format: OLD_JSON_COMPRESSED_XZ
+    createDiff: true
+    checkImportListUrl: true
+
+# film url is consider invalid if the size is below the minSize
+checkImportListUrlMinSize: 5012
+
+# abort url checking after x sec
+checkImportListUrlTimeoutInSec: 1800
 
 #### Default crawler configurations ####
 # The maximum amount of URLs to be processed per task.
 maximumUrlsPerTask: 50
 
 # The maximum duration in minutes a crawler may run.
-maximumCrawlDurationInMinutes: 60
+maximumCrawlDurationInMinutes: 120
+
+# Enables the topics search
+# maximumSubpages limits the depth of the topics search
+topicsSearchEnabled: false
 
 # The maximum amount of sub pages to be crawled.
 # Example: If a Sendung overview side has 10 pages with videos for this Sendung and
@@ -113,9 +134,20 @@ socketTimeoutInSeconds: 60
 senderConfigurations:
   ARD:
     # Actually the ARD has a maximum of 6 days in the past
-    maximumDaysForSendungVerpasstSection: 6
+    maximumDaysForSendungVerpasstSection: 1
+    #2,4,8 ok
+    maximumUrlsPerTask: 32
+    #10,20,40 ok
+    maximumSubpages: 0
+  ORF:
+    #2,4,8 ok
+    maximumUrlsPerTask: 40
   ARTE_DE:
-    maximumDaysForSendungVerpasstSectionFuture: 21
+    maximumUrlsPerTask: 1
+    maximumDaysForSendungVerpasstSectionFuture: 0
+    maximumRequestsPerSecond: 2.0
+  ARTE_FR:
+    maximumDaysForSendungVerpasstSectionFuture: 0
     # The maximum amount of URLs to be processed per task.
     # maximumUrlsPerTask: 25
     # The maximum duration in minutes a crawler may run.
@@ -125,9 +157,21 @@ senderConfigurations:
     # the amount set by this is 5 then the crawler crawls pages 1 to 5.
     # maximumSubpages: 3
   KIKA:
-    socketTimeoutInSeconds: 120
-
-
+    maximumSubpages: 2
+    maximumRequestsPerSecond: 8.0
+  SR:
+    maximumRequestsPerSecond: 2.0
+  ZDF:
+    maximumRequestsPerSecond: 10.0
+  FUNK:
+    maximumUrlsPerTask: 99
+  DW:
+    maximumSubpages: 0
+
+# configure string variables
+crawlerApiParams:
+  FUNK_REQUEST_TOKEN: 137782e774d7cadc93dcbffbbde0ce9c
+
 #### COPY ####
 copySettings:
   # En- / disables FTP