Hdfs datasource tasker implementation #91

Open
wants to merge 62 commits into base: main

Commits (62)
a569977
Migration of the branch to github, InstantiationTest passed successfu…
Tiihott Sep 11, 2024
f82ed6c
Spotless
Tiihott Sep 13, 2024
212a6a0
Implemented incrementAndGetLatestOffset() method to HdfsQuery interfa…
Tiihott Sep 13, 2024
2677f4a
Implemented Kerberized hdfs access for HdfsDBClient. Implemented requ…
Tiihott Sep 16, 2024
d2337ab
Added avro dependencies and schema.
Tiihott Sep 17, 2024
b2ee255
Updated mock hdfs files with files built using the updated schema.
Tiihott Sep 17, 2024
e90e57c
Implemented preliminary code for tasker component. S3-HDFS cutoff fil…
Tiihott Sep 17, 2024
66b420a
Implemented preliminary S3-HDFS cutoff filtering.
Tiihott Sep 18, 2024
b4d58bb
Fixed error in InternalRow formatting.
Tiihott Sep 18, 2024
d8b07d7
Implemented HdfsMicroBatchInputPartitionReaderTest.java. Improved Moc…
Tiihott Sep 18, 2024
ae3b651
Added missing .close() call and fixed error in commenting.
Tiihott Sep 18, 2024
03b1ed0
Added more tests to HdfsMicroBatchInputPartitionReaderTest.java.
Tiihott Sep 19, 2024
4118414
Implemented AvroReaderTest.java. Moved hdfs tasker tests to their own…
Tiihott Sep 19, 2024
3b3c703
Removed obsolete test code from MockHDFS.
Tiihott Sep 19, 2024
baf2dde
Refactored HdfsRecordConverter class to HdfsRecordConversionImpl clas…
Tiihott Sep 19, 2024
388aaba
Implemented FileSystemFactoryImpl and FileSystemFactory interface.
Tiihott Sep 19, 2024
f8047d0
Moved FileSystem initialization logic from constructors of HdfsDBClie…
Tiihott Sep 19, 2024
15a9849
Refactored class attributes into local variables.
Tiihott Sep 19, 2024
3aa980f
Renamed AvroReader to AvroReadImpl. Implemented AvroRead interface.
Tiihott Sep 20, 2024
7c5ee1b
Added logging
Tiihott Sep 20, 2024
f8d85d4
Added new test to HdfsMicroBatchInputPartitionReaderTest
Tiihott Sep 20, 2024
0952390
Implemented FileSystemFactoryTest.java
Tiihott Sep 20, 2024
c8073b0
Implemented InstantationKafkaDisabledTest.java
Tiihott Sep 20, 2024
77ba9e0
Improved comments
Tiihott Sep 20, 2024
d217b08
Removed obsolete comments.
Tiihott Oct 14, 2024
bae6969
Converted private static literal mapType to a local variable.
Tiihott Oct 15, 2024
b6b8741
Renamed create() method to fileSystem() in FileSystemFactory interface.
Tiihott Oct 15, 2024
b7e5508
Renamed HdfsTopicPartitionOffsetMetadata.java to HdfsFileMetadata.java.
Tiihott Oct 15, 2024
f4bfe07
Refactored UseHdfsHostname naming usage to useHdfsHostname.
Tiihott Oct 15, 2024
a42b2d1
Cleaned up comments.
Tiihott Oct 16, 2024
4bc1ecd
Refactored HdfsDBClient constructor into secondary and primary constr…
Tiihott Oct 16, 2024
4c007bb
Refactored PathFilter interface usage in HdfsDBClient by implementing…
Tiihott Oct 17, 2024
f34678b
Renamed get() to record() in AvroRead interface.
Tiihott Oct 17, 2024
6c91b0d
Renamed get() to row() in HdfsRecordConversion interface.
Tiihott Oct 17, 2024
ce46083
Refactored primary constructor to only initialize all the encapsulate…
Tiihott Oct 17, 2024
30a4220
Refactoring AvroReadImpl.java, HdfsMicroBatchInputPartitionReader.jav…
Tiihott Oct 21, 2024
1a1063c
Refactored ArchiveMicroStreamReader.java, DatasourceOffset.java and S…
Tiihott Oct 22, 2024
72e3f77
Refactored ArchiveMicroStreamReader.java, HdfsOffset.java and KafkaOf…
Tiihott Oct 22, 2024
4c34f54
Replaced KafkaOffset and HdfsOffset null references with stubs.
Tiihott Oct 23, 2024
95ec8db
Removed serializable map conversion logic away from HdfsOffset constr…
Tiihott Oct 23, 2024
a5268f0
Implemented stub to replace null usage for HdfsQuery.
Tiihott Oct 23, 2024
e1379c5
Implemented StubBatchSliceCollection.java to allow initialization of …
Tiihott Oct 23, 2024
3c01d45
Refactored AvroRead and its implementation to restore hdfs time based…
Tiihott Oct 24, 2024
b389223
Combined 3 for-loops into one.
Tiihott Oct 24, 2024
efe3f8b
Fixed unintentional capital letter usage in config key.
Tiihott Oct 25, 2024
a83ba84
Moved TopicFilter initialization to secondary constructor.
Tiihott Oct 25, 2024
05304eb
Initialize topicsRegexString with all topics instead of empty string.
Tiihott Oct 25, 2024
968ab85
Implemented clearing of syslogRecordBuffer list as part of next() ins…
Tiihott Oct 25, 2024
56b3ef2
Refactored StubBatchSliceCollection to throw exception when it is pro…
Tiihott Oct 25, 2024
d0cef90
Refactored HdfsOffset constructors and added hdfs offset serde test.
Tiihott Oct 28, 2024
26e3987
Removed serializable map conversion logic away from KafkaOffset const…
Tiihott Oct 28, 2024
8d9d937
Replaced returning of stub with exception throw.
Tiihott Oct 28, 2024
3194415
Checkstyle plugin implementation and Checkstyle code cleanup. WIP
Tiihott Oct 30, 2024
b0461c1
Checkstyle plugin implementation and Checkstyle code cleanup, WIP. Ad…
Tiihott Nov 1, 2024
9060b6b
Checkstyle plugin implementation and Checkstyle code cleanup, WIP. Ad…
Tiihott Nov 4, 2024
f8c9c7a
Checkstyle plugin implementation and Checkstyle code cleanup, WIP. Re…
Tiihott Nov 4, 2024
efafc2e
Checkstyle plugin implementation complete. Implemented serializable M…
Tiihott Nov 5, 2024
932c39b
Refactored hashCode() methods to use Objects.hash().
Tiihott Nov 7, 2024
45c4a70
Renamed OffsetInterface to Offset. Spotless.
Tiihott Nov 8, 2024
abe5823
Replaced RuntimeException with more descriptive UncheckedIOException.
Tiihott Nov 8, 2024
d83af23
Rebase. Fixed leftover conflicts from rebase.
Tiihott Nov 8, 2024
d7550fa
Rebase Checkstyle code cleanup.
Tiihott Nov 11, 2024
304 changes: 304 additions & 0 deletions pom.xml

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions src/main/avro/KafkaRecord.avsc
@@ -0,0 +1,15 @@
{"namespace": "com.teragrep.pth_06.avro",
"type": "record",
"name": "SyslogRecord",
"fields": [
{"name": "timestamp", "type": "long"},
{"name": "directory", "type": "string"},
{"name": "stream", "type": "string"},
{"name": "host", "type": "string"},
{"name": "input", "type": "string"},
{"name": "partition", "type": "string"},
{"name": "offset", "type": "long"},
{"name": "origin", "type": "string"},
{"name": "payload", "type": "string"}
]
}
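
Not part of the diff: the sketch below shows how files written with this SyslogRecord schema could be read back through Avro's generic API, roughly the job the PR's AvroRead/AvroReadImpl presumably performs. The file path and the choice of printed fields are illustrative assumptions.

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

import java.io.File;
import java.io.IOException;

public class SyslogRecordDump {

    public static void main(String[] args) throws IOException {
        // Path to a locally available avro file; illustrative only.
        File avroFile = new File(args[0]);
        // The writer schema is embedded in the file, so no reader schema is supplied here.
        GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
        try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(avroFile, datumReader)) {
            while (fileReader.hasNext()) {
                GenericRecord record = fileReader.next();
                System.out.println(record.get("partition") + "/" + record.get("offset") + " " + record.get("payload"));
            }
        }
    }
}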
150 changes: 97 additions & 53 deletions src/main/java/com/teragrep/pth_06/ArchiveMicroStreamReader.java
@@ -46,12 +46,15 @@
package com.teragrep.pth_06;

import com.google.common.annotations.VisibleForTesting;
import com.google.gson.JsonArray;
import com.teragrep.pth_06.config.Config;
import com.teragrep.pth_06.planner.*;
import com.teragrep.pth_06.planner.offset.DatasourceOffset;
import com.teragrep.pth_06.planner.offset.HdfsOffset;
import com.teragrep.pth_06.planner.offset.KafkaOffset;
import com.teragrep.pth_06.scheduler.*;
import com.teragrep.pth_06.task.ArchiveMicroBatchInputPartition;
import com.teragrep.pth_06.task.HdfsMicroBatchInputPartition;
import com.teragrep.pth_06.task.TeragrepPartitionReaderFactory;
import com.teragrep.pth_06.task.KafkaMicroBatchInputPartition;
import org.apache.spark.sql.connector.read.InputPartition;
@@ -87,6 +90,8 @@ public final class ArchiveMicroStreamReader implements MicroBatchStream {
private final Config config;
private final ArchiveQuery aq;
private final KafkaQuery kq;
private final HdfsQuery hq;
private final JsonArray hdfsOffsets;

/**
* Constructor for ArchiveMicroStreamReader
@@ -98,6 +103,15 @@ public final class ArchiveMicroStreamReader implements MicroBatchStream {

this.config = config;

if (config.isHdfsEnabled) {
this.hq = new HdfsQueryProcessor(config);
hdfsOffsets = hq.hdfsOffsetMapToJSON();
}
else {
this.hq = new HdfsQueryProcessor();
hdfsOffsets = new JsonArray();
}

if (config.isArchiveEnabled) {
this.aq = new ArchiveQueryProcessor(config);
}
@@ -106,7 +120,13 @@ public final class ArchiveMicroStreamReader implements MicroBatchStream {
}

if (config.isKafkaEnabled) {
this.kq = new KafkaQueryProcessor(config);
if (config.isHdfsEnabled) {
this.kq = new KafkaQueryProcessor(config);
this.kq.seekToHdfsOffsets(hdfsOffsets);
}
else {
this.kq = new KafkaQueryProcessor(config);
}
}
else {
this.kq = null;
@@ -119,10 +139,18 @@ public final class ArchiveMicroStreamReader implements MicroBatchStream {
* Used for testing.
*/
@VisibleForTesting
ArchiveMicroStreamReader(ArchiveQuery aq, KafkaQuery kq, Config config) {
ArchiveMicroStreamReader(HdfsQuery hq, ArchiveQuery aq, KafkaQuery kq, Config config) {
this.config = config;
this.aq = aq;
this.kq = kq;
this.aq = aq; // Uses its own hardcoded query string defined in MockTeragrepDatasource.
this.kq = kq; // Skips using query string (and thus topic filtering) altogether.
this.hq = hq; // Uses the query string from config for topic filtering.
if (!this.hq.isStub() && this.kq != null) {
hdfsOffsets = this.hq.hdfsOffsetMapToJSON();
this.kq.seekToHdfsOffsets(hdfsOffsets);
}
else {
hdfsOffsets = new JsonArray();
}

LOGGER.debug("@VisibleForTesting MicroBatchReader> initialized");
}
@@ -137,25 +165,37 @@ public final class ArchiveMicroStreamReader implements MicroBatchStream {
public Offset initialOffset() {
// archive only: subtract 3600s (1 hour) from earliest to return first row (start exclusive)
DatasourceOffset rv;
if (this.config.isArchiveEnabled && !this.config.isKafkaEnabled) {
// only archive
rv = new DatasourceOffset(new LongOffset(this.aq.getInitialOffset() - 3600L));
HdfsOffset hdfsOffset = new HdfsOffset(); // stub
LongOffset longOffset = null; // Refactor null usage
KafkaOffset kafkaOffset = new KafkaOffset(); // stub

if (this.config.isHdfsEnabled) {
hdfsOffset = new HdfsOffset(this.hq.getBeginningOffsets().getOffsetMap());
hdfsOffset.serialize();
}
else if (!this.config.isArchiveEnabled && this.config.isKafkaEnabled) {
// only kafka
rv = new DatasourceOffset(new KafkaOffset(this.kq.getBeginningOffsets(null)));
if (this.config.isArchiveEnabled) {
longOffset = new LongOffset(this.aq.getInitialOffset() - 3600L);
}
else if (this.config.isArchiveEnabled) {
// both
rv = new DatasourceOffset(
new LongOffset(this.aq.getInitialOffset() - 3600L),
new KafkaOffset(this.kq.getBeginningOffsets(null))
);
if (this.config.isKafkaEnabled) {
if (this.config.isHdfsEnabled) {
if (hdfsOffsets.size() > 0) {
kafkaOffset = new KafkaOffset(this.kq.getConsumerPositions(hdfsOffsets));
kafkaOffset.serialize();
}
else {
kafkaOffset = new KafkaOffset(this.kq.getBeginningOffsets(null));
kafkaOffset.serialize();
}
}
else {
kafkaOffset = new KafkaOffset(this.kq.getBeginningOffsets(null));
kafkaOffset.serialize();
}
}
else {
// neither
throw new IllegalStateException("no datasources enabled, can't get initial offset");
if (hdfsOffset.isStub() && longOffset == null && kafkaOffset.isStub()) {
throw new IllegalStateException("no datasources enabled, can't get latest offset");
}
rv = new DatasourceOffset(hdfsOffset, longOffset, kafkaOffset);
LOGGER.debug("offset[initial]= {}", rv);
return rv;
}
@@ -172,6 +212,9 @@ public void commit(Offset offset) {
if (this.config.isArchiveEnabled) {
this.aq.commit(((DatasourceOffset) offset).getArchiveOffset().offset());
}
if (this.config.isHdfsEnabled) {
this.hq.commit(((DatasourceOffset) offset).getHdfsOffset());
}
}

/** {@inheritDoc} */
@@ -188,26 +231,25 @@ public void stop() {
@Override
public Offset latestOffset() {
DatasourceOffset rv;
if (this.config.isArchiveEnabled && !this.config.isKafkaEnabled) {
// only archive
rv = new DatasourceOffset(new LongOffset(this.aq.incrementAndGetLatestOffset()));
HdfsOffset hdfsOffset = new HdfsOffset();
LongOffset longOffset = null;
KafkaOffset kafkaOffset = new KafkaOffset();

if (this.config.isHdfsEnabled) {
hdfsOffset = new HdfsOffset(this.hq.incrementAndGetLatestOffset().getOffsetMap());
hdfsOffset.serialize();
}
else if (!this.config.isArchiveEnabled && this.config.isKafkaEnabled) {
// only kafka
rv = new DatasourceOffset(new KafkaOffset(this.kq.getInitialEndOffsets()));
if (this.config.isArchiveEnabled) {
longOffset = new LongOffset(this.aq.incrementAndGetLatestOffset());
}
else if (this.config.isArchiveEnabled) {
// both
rv = new DatasourceOffset(
new LongOffset(this.aq.incrementAndGetLatestOffset()),
new KafkaOffset(this.kq.getInitialEndOffsets())
);
if (this.config.isKafkaEnabled) {
kafkaOffset = new KafkaOffset(this.kq.getInitialEndOffsets());
kafkaOffset.serialize();
}
else {
// neither
if (hdfsOffset.isStub() && longOffset == null && kafkaOffset.isStub()) {
throw new IllegalStateException("no datasources enabled, can't get latest offset");
}

rv = new DatasourceOffset(hdfsOffset, longOffset, kafkaOffset);
LOGGER.debug("offset[latest]= {}", rv);
return rv;
}
@@ -223,18 +265,34 @@ else if (this.config.isArchiveEnabled) {
public InputPartition[] planInputPartitions(Offset start, Offset end) {
List<InputPartition> inputPartitions = new LinkedList<>();

Batch currentBatch = new Batch(config, aq, kq).processRange(start, end);
Batch currentBatch = new Batch(config, hq, aq, kq).processRange(start, end);

for (LinkedList<BatchSlice> taskObjectList : currentBatch) {

// archive tasks
LinkedList<ArchiveS3ObjectMetadata> archiveTaskList = new LinkedList<>();
// HDFS tasks
LinkedList<HdfsFileMetadata> hdfsTaskList = new LinkedList<>();
for (BatchSlice batchSlice : taskObjectList) {
if (batchSlice.type.equals(BatchSlice.Type.ARCHIVE)) {
archiveTaskList.add(batchSlice.archiveS3ObjectMetadata);
}
if (batchSlice.type.equals(BatchSlice.Type.HDFS)) {
hdfsTaskList.add(batchSlice.hdfsFileMetadata);
}
if (batchSlice.type.equals(BatchSlice.Type.KAFKA)) {
inputPartitions
.add(
new KafkaMicroBatchInputPartition(
config.kafkaConfig.executorOpts,
batchSlice.kafkaTopicPartitionOffsetMetadata.topicPartition,
batchSlice.kafkaTopicPartitionOffsetMetadata.startOffset,
batchSlice.kafkaTopicPartitionOffsetMetadata.endOffset,
config.kafkaConfig.executorConfig,
config.kafkaConfig.skipNonRFC5424Records
)
);
}
}

if (!archiveTaskList.isEmpty()) {
inputPartitions
.add(
Expand All @@ -251,22 +309,8 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) {
)
);
}

// kafka tasks
for (BatchSlice batchSlice : taskObjectList) {
if (batchSlice.type.equals(BatchSlice.Type.KAFKA)) {
inputPartitions
.add(
new KafkaMicroBatchInputPartition(
config.kafkaConfig.executorOpts,
batchSlice.kafkaTopicPartitionOffsetMetadata.topicPartition,
batchSlice.kafkaTopicPartitionOffsetMetadata.startOffset,
batchSlice.kafkaTopicPartitionOffsetMetadata.endOffset,
config.kafkaConfig.executorConfig,
config.kafkaConfig.skipNonRFC5424Records
)
);
}
if (!hdfsTaskList.isEmpty()) {
inputPartitions.add(new HdfsMicroBatchInputPartition(config.hdfsConfig, hdfsTaskList));
}
}

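Not shown in the PR excerpt itself: a minimal sketch of the call order that Spark's micro-batch engine drives against a MicroBatchStream such as ArchiveMicroStreamReader above. The helper class and its name are illustrative assumptions, not part of pth_06.

import org.apache.spark.sql.connector.read.InputPartition;
import org.apache.spark.sql.connector.read.streaming.MicroBatchStream;
import org.apache.spark.sql.connector.read.streaming.Offset;

public final class MicroBatchLifecycleSketch {

    public static void drive(MicroBatchStream stream) {
        // 1. Resolve the composed starting point (HdfsOffset + LongOffset + KafkaOffset above).
        Offset start = stream.initialOffset();
        // 2. Ask the enabled datasources how far the next micro-batch may reach.
        Offset end = stream.latestOffset();
        // 3. Plan HDFS, archive and Kafka slices for the executors to read.
        InputPartition[] partitions = stream.planInputPartitions(start, end);
        // 4. Executors consume the partitions via the reader factory (omitted here).
        // 5. Acknowledge the processed range per datasource, then shut down.
        stream.commit(end);
        stream.stop();
    }
}
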
56 changes: 56 additions & 0 deletions src/main/java/com/teragrep/pth_06/FileSystemFactory.java
@@ -0,0 +1,56 @@
/*
* Teragrep Archive Datasource (pth_06)
* Copyright (C) 2021-2024 Suomen Kanuuna Oy
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*
* Additional permission under GNU Affero General Public License version 3
* section 7
*
* If you modify this Program, or any covered work, by linking or combining it
* with other code, such other code is not for that reason alone subject to any
* of the requirements of the GNU Affero GPL version 3 as long as this Program
* is the same Program as licensed from Suomen Kanuuna Oy without any additional
* modifications.
*
* Supplemented terms under GNU Affero General Public License version 3
* section 7
*
* Origin of the software must be attributed to Suomen Kanuuna Oy. Any modified
* versions must be marked as "Modified version of" The Program.
*
* Names of the licensors and authors may not be used for publicity purposes.
*
* No rights are granted for use of trade names, trademarks, or service marks
* which are in The Program if any.
*
* Licensee must indemnify licensors and authors for any liability that these
* contractual assumptions impose on licensors and authors.
*
* To the extent this program is licensed as part of the Commercial versions of
* Teragrep, the applicable Commercial License may apply to this file if you as
* a licensee so wish it.
*/
package com.teragrep.pth_06;

import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;

public interface FileSystemFactory {

public abstract FileSystem fileSystem(boolean initializeUGI) throws IOException;

}
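
The PR also adds a FileSystemFactoryImpl (per the commit list), but it is not included in this excerpt. Below is a hypothetical sketch of one way the fileSystem(boolean) contract could be satisfied with Hadoop's FileSystem.get(); the class name, constructor parameters and Kerberos handling are assumptions, not the PR's actual implementation.

package com.teragrep.pth_06;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.security.UserGroupInformation;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public final class ExampleFileSystemFactory implements FileSystemFactory {

    private final String hdfsUri; // e.g. "hdfs://namenode:8020", illustrative
    private final Configuration hadoopConf; // possibly carrying Kerberos settings

    public ExampleFileSystemFactory(String hdfsUri, Configuration hadoopConf) {
        this.hdfsUri = hdfsUri;
        this.hadoopConf = hadoopConf;
    }

    @Override
    public FileSystem fileSystem(boolean initializeUGI) throws IOException {
        if (initializeUGI) {
            // Register the (possibly Kerberized) configuration with Hadoop's login machinery.
            UserGroupInformation.setConfiguration(hadoopConf);
        }
        try {
            return FileSystem.get(new URI(hdfsUri), hadoopConf);
        }
        catch (URISyntaxException e) {
            throw new IOException("invalid HDFS uri: " + hdfsUri, e);
        }
    }
}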