
Use Open Clover for code coverage analysis #48

Merged

68 commits:

eb9dc8f
cleaning up the main branch for new version.
prodriguezdefino Jun 27, 2023
0695673
updates in ignore and readme files
prodriguezdefino Jun 27, 2023
e36cc44
prepping the pom addition, added parent's compliance tools
prodriguezdefino Jun 27, 2023
e167803
adding parent pom and the connector impl project pom
prodriguezdefino Jun 28, 2023
8043378
adding common functionalities
prodriguezdefino Jun 28, 2023
b455b4e
added the bigquery services wrapper and factories
prodriguezdefino Jun 28, 2023
27cd837
creates the split, its state and the enumerator state
prodriguezdefino Jun 28, 2023
4d4b60f
added configs, split reader and split assignment impls
prodriguezdefino Jun 28, 2023
0567b58
applying recommendations from sonartype-lift
prodriguezdefino Jun 28, 2023
9006714
adding the Datastream source implementation for BigQuery
prodriguezdefino Jun 28, 2023
d5d95bf
added Table API implementation for BigQuery
prodriguezdefino Jun 29, 2023
1263768
adding the example and shaded distro jar, fixes a NPE when the provid…
prodriguezdefino Jun 29, 2023
5743292
merge changes from master (previous pom deletion resolution)
prodriguezdefino Jul 6, 2023
cab9115
fixing the package name for the schema namespace
prodriguezdefino Jul 6, 2023
3375d54
Merge branch 'common_code_source' into bq_services_wrappers
prodriguezdefino Jul 10, 2023
849f769
merged main branch and took care of few lift comments
prodriguezdefino Jul 11, 2023
fd70b95
Merge branch 'bq_services_wrappers' into source_splits
prodriguezdefino Jul 11, 2023
0f18d14
merge from source_split
prodriguezdefino Jul 11, 2023
6b08119
fixing lift recommendations and spotless
prodriguezdefino Jul 11, 2023
5973d26
Merge branch 'split_assigner_and_reader' into source_functionality
prodriguezdefino Jul 11, 2023
2d9635d
Merge branch 'source_functionality' into table_api
prodriguezdefino Jul 11, 2023
135beeb
Merge branch 'table_api' into add_example_and_shadedsqljar
prodriguezdefino Jul 11, 2023
3ffb582
fixes namespace error in test and spotless
prodriguezdefino Jul 11, 2023
cc45b2f
Merge branch 'table_api' into add_example_and_shadedsqljar
prodriguezdefino Jul 11, 2023
f473d57
addressing comments from review
prodriguezdefino Jul 27, 2023
c178f83
merge from main
prodriguezdefino Aug 1, 2023
09eaaa4
merge from master
prodriguezdefino Aug 1, 2023
def3cc4
Merge branch 'source_splits' into split_assigner_and_reader
prodriguezdefino Aug 1, 2023
ceabb12
fixed type reference Int -> Long
prodriguezdefino Aug 1, 2023
0dc8875
Merge branch 'split_assigner_and_reader' into source_functionality
prodriguezdefino Aug 3, 2023
963c80b
Merge branch 'source_functionality' into table_api
prodriguezdefino Aug 3, 2023
5206ef1
Merge branch 'table_api' into add_example_and_shadedsqljar
prodriguezdefino Aug 3, 2023
1734bac
merge from main
prodriguezdefino Aug 8, 2023
e96ff59
addressing comments from review
prodriguezdefino Aug 8, 2023
3b78492
Merge branch 'split_assigner_and_reader' into source_functionality
prodriguezdefino Aug 8, 2023
c492f02
improved hashcode and equals readability
prodriguezdefino Aug 8, 2023
6a05498
Merge branch 'split_assigner_and_reader' into source_functionality
prodriguezdefino Aug 8, 2023
820fb3b
Merge branch 'source_functionality' into table_api
prodriguezdefino Aug 8, 2023
dd5be94
improve readibility for hashcode and equals
prodriguezdefino Aug 8, 2023
fbd07c6
Merge branch 'table_api' into add_example_and_shadedsqljar
prodriguezdefino Aug 8, 2023
9aae0af
changed tests to use google-truth instead of junit or assertj asserti…
prodriguezdefino Aug 9, 2023
61e5644
Merge branch 'split_assigner_and_reader' into source_functionality
prodriguezdefino Aug 9, 2023
517de82
added google-truth to tests
prodriguezdefino Aug 9, 2023
d916a1c
Merge branch 'source_functionality' into table_api
prodriguezdefino Aug 9, 2023
11fce0d
added google-truth to tests
prodriguezdefino Aug 9, 2023
099078e
Merge branch 'table_api' into add_example_and_shadedsqljar
prodriguezdefino Aug 10, 2023
9da7706
merge from master after #44
prodriguezdefino Oct 31, 2023
a10470c
Merge branch 'source_functionality' into table_api
prodriguezdefino Oct 31, 2023
6b28a0c
removing guava dependency from file
prodriguezdefino Nov 1, 2023
7160ff9
merge from master
prodriguezdefino Nov 1, 2023
87780c5
adding serializable autovalue annotation to avoid storing an Optional…
prodriguezdefino Nov 2, 2023
3f4d1be
addressing comments from review
prodriguezdefino Nov 2, 2023
c8eb789
addressed comments from review
prodriguezdefino Nov 2, 2023
fd79610
Merge branch 'table_api' into add_example_and_shadedsqljar
prodriguezdefino Nov 2, 2023
cce6567
partition list in 1024 chunks
prodriguezdefino Nov 3, 2023
f36f91f
Merge branch 'table_api' into add_example_and_shadedsqljar
prodriguezdefino Nov 3, 2023
5260073
Address review comments
jayehwhyehentee Nov 23, 2023
936076b
adding the example and shaded distro jar, fixes a NPE when the provid…
prodriguezdefino Jun 29, 2023
7977576
Merge branch 'add_example_and_shadedsqljar' of https://github.com/pro…
jayehwhyehentee Nov 24, 2023
1460af6
added covertura analysis through open clover and improved few tests
prodriguezdefino Jun 30, 2023
a8499c3
removed repeated mvn reference from command
prodriguezdefino Jul 11, 2023
229f33d
Merge branch 'main' into use_openclover_coverage
jayehwhyehentee Nov 24, 2023
68d0bf9
Merge branch 'main' into add_example_and_shadedsqljar
jayehwhyehentee Nov 24, 2023
608e3c7
Merge branch 'add_example_and_shadedsqljar' into use_openclover_coverage
jayehwhyehentee Nov 24, 2023
c9f73ee
Address review comments
jayehwhyehentee Nov 27, 2023
8c9cc26
Merge branch 'add_example_and_shadedsqljar' into use_openclover_coverage
jayehwhyehentee Nov 27, 2023
27c34d0
Make checkpoint interval configurable in example jar
jayehwhyehentee Nov 27, 2023
fa4f5a4
Merge branch 'add_example_and_shadedsqljar' into use_openclover_coverage
jayehwhyehentee Nov 27, 2023
2 changes: 1 addition & 1 deletion README.md
@@ -22,7 +22,7 @@ Prerequisites:

```
git clone https://github.com/GoogleCloudDataproc/flink-bigquery-connector
-cd flink-connector-bigquery
+cd flink-bigquery-connector
mvn clean package -DskipTests
```

16 changes: 2 additions & 14 deletions cloudbuild/cloudbuild.yaml
@@ -13,27 +13,15 @@ steps:
     env:
     - 'CODECOV_TOKEN=${_CODECOV_TOKEN}'
 
-  # 3. Run unit tests
+  # 3. Run unit & integration tests
   - name: 'gcr.io/$PROJECT_ID/dataproc-flink-bigquery-connector-presubmit'
     id: 'unit-tests'
     waitFor: ['init']
     entrypoint: 'bash'
-    args: ['/workspace/cloudbuild/presubmit.sh', 'unittest']
+    args: ['/workspace/cloudbuild/presubmit.sh', 'tests']
     env:
     - 'CODECOV_TOKEN=${_CODECOV_TOKEN}'
 
-  # 4. Run integration tests concurrently with unit tests
-  # Commeneted out until integration tests are ported
-  # - name: 'gcr.io/$PROJECT_ID/dataproc-flink-bigquery-connector-presubmit'
-  #   id: 'integration-tests'
-  #   waitFor: ['unit-tests']
-  #   entrypoint: 'bash'
-  #   args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest']
-  #   env:
-  #   - 'GOOGLE_CLOUD_PROJECT=${_GOOGLE_CLOUD_PROJECT}'
-  #   - 'TEMPORARY_GCS_BUCKET=${_TEMPORARY_GCS_BUCKET}'
-  #   - 'CODECOV_TOKEN=${_CODECOV_TOKEN}'
 
 # Tests take around 20 mins in general.
 timeout: 1800s

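For local verification, the same pipeline can be triggered by hand. A minimal sketch, assuming `gcloud` is authenticated against the right project and a `_CODECOV_TOKEN` substitution is defined (the value below is a placeholder):

```
# Sketch: run the presubmit pipeline manually with Cloud Build.
# The substitution value is a placeholder, not a real token.
gcloud builds submit --config cloudbuild/cloudbuild.yaml \
    --substitutions=_CODECOV_TOKEN=placeholder-token .
```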
13 changes: 4 additions & 9 deletions cloudbuild/presubmit.sh
@@ -30,18 +30,13 @@ cd /workspace
 case $STEP in
   # Download maven and all the dependencies
   init)
-    $MVN install -DskipTests
+    $MVN clean install -DskipTests
     exit
     ;;
 
-  # Run unit tests
-  unittest)
-    $MVN test jacoco:report jacoco:report-aggregate
-    ;;
-
-  # Run integration tests
-  integrationtest)
-    $MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate
+  # Run unit & integration tests
+  tests)
+    $MVN clean clover:setup verify clover:aggregate clover:clover -Pclover -pl flink-connector-bigquery
     ;;
 
   *)
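The new `tests` step drives OpenClover end to end. A minimal sketch of reproducing it locally, assuming Maven is installed and the `clover` profile referenced by `-Pclover` is defined in the parent pom:

```
# Instrument sources, run unit & integration tests, then aggregate and render
# the OpenClover coverage report for the connector module.
mvn clean clover:setup verify clover:aggregate clover:clover \
    -Pclover -pl flink-connector-bigquery
# By default the clover-maven-plugin writes the HTML report under the
# module's target/site/clover/ directory.
```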
131 changes: 131 additions & 0 deletions flink-connector-bigquery-examples/pom.xml
@@ -0,0 +1,131 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-bigquery-parent</artifactId>
        <version>1.1-SNAPSHOT</version>
    </parent>

    <artifactId>flink-connector-bigquery-examples</artifactId>
    <name>Flink : Connectors : Google BigQuery Examples</name>
    <packaging>jar</packaging>

    <properties>
        <japicmp.skip>true</japicmp.skip>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-bigquery</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>BigQueryExample</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-deploy-plugin</artifactId>
                <configuration>
                    <skip>true</skip>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <executions>
                    <execution>
                        <id>shade-flink</id>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <shadedArtifactAttached>false</shadedArtifactAttached>
                            <createDependencyReducedPom>false</createDependencyReducedPom>
                            <resources>
                                <resource>
                                    <directory>src/test/resources</directory>
                                </resource>
                            </resources>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>org.apache.flink.examples.gcp.bigquery.BigQueryExample</mainClass>
                                </transformer>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
                                    <resource>log4j2-bqexample.properties</resource>
                                    <file>src/main/resources/log4j2-bqexample.properties</file>
                                </transformer>
                            </transformers>
                            <artifactSet>
                                <includes>
                                    <include>org.apache.flink:flink-connector-bigquery</include>
                                    <include>org.apache.flink:flink-avro</include>
                                    <include>org.apache.flink:flink-metrics-dropwizard</include>
                                    <include>com.google.cloud:google-cloud-bigquerystorage</include>
                                    <include>com.google.*:*</include>
                                    <include>commons-codec:commons-codec</include>
                                    <include>dev.failsafe:*</include>
                                    <include>org.apache.avro:*</include>
                                    <include>org.apache.httpcomponents:*</include>
                                    <include>org.codehaus.mojo:animal-sniffer-annotations</include>
                                    <include>org.conscrypt:*</include>
                                    <include>com.fasterxml.jackson.*:*</include>
                                    <include>org.threeten:*</include>
                                    <include>org.checkerframework:*</include>
                                    <include>io.dropwizard.metrics:*</include>
                                    <include>io.grpc:*</include>
                                    <include>io.opencensus:*</include>
                                    <include>io.perfmark:*</include>
                                </includes>
                            </artifactSet>
                            <relocations>
                                <relocation>
                                    <pattern>com.google</pattern>
                                    <shadedPattern>org.apache.flink.examples.gcp.bigquery.shaded.com.google</shadedPattern>
                                </relocation>
                            </relocations>

                            <filters>
                                <filter>
                                    <artifact>org.apache.flink:flink-connector-bigquery-examples*</artifact>
                                    <includes>
                                        <include>org/apache/flink/examples/gcp/bigquery/**</include>
                                    </includes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
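Given this module definition, the self-contained example jar can be produced with a standard Maven invocation. A sketch, assuming the build runs from the repository root:

```
# Build the shaded example jar; <finalName> names it BigQueryExample.jar and the
# shade plugin relocates the bundled Google dependencies under
# org.apache.flink.examples.gcp.bigquery.shaded.
mvn clean package -DskipTests -pl flink-connector-bigquery-examples -am
ls flink-connector-bigquery-examples/target/BigQueryExample.jar
```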
140 changes: 140 additions & 0 deletions flink-connector-bigquery-examples/src/main/java/org/apache/flink/examples/gcp/bigquery/BigQueryExample.java
@@ -0,0 +1,140 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.examples.gcp.bigquery;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import com.google.cloud.flink.bigquery.common.config.BigQueryConnectOptions;
import com.google.cloud.flink.bigquery.source.BigQuerySource;
import com.google.cloud.flink.bigquery.source.config.BigQueryReadOptions;
import org.apache.avro.generic.GenericRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A simple BigQuery table read example with Flink's DataStream API.
 *
 * <p>The Flink pipeline reads the specified BigQuery table, optionally applying the given row
 * restriction and limit count, returns {@link GenericRecord} instances representing the rows,
 * and finally prints some aggregated values for the provided payload field.
 *
 * <p>Note on row restriction: when a restriction contains a temporal reference, e.g. {@code
 * "TIMESTAMP_TRUNC(ingestion_timestamp, HOUR) = '2023-06-20 19:00:00'"}, and the job is
 * launched through Flink's REST API, the single quotes are not supported and will make the
 * pipeline fail. As a workaround, use \u0027 as a replacement, e.g. {@code
 * "TIMESTAMP_TRUNC(ingestion_timestamp, HOUR) = \u00272023-06-20 19:00:00\u0027"}.
 */
public class BigQueryExample {

    private static final Logger LOG = LoggerFactory.getLogger(BigQueryExample.class);

    public static void main(String[] args) throws Exception {
        // parse input arguments
        final ParameterTool parameterTool = ParameterTool.fromArgs(args);

        if (parameterTool.getNumberOfParameters() < 4) {
            LOG.error(
                    "Missing parameters!\n"
                            + "Usage: flink run <additional runtime params> BigQuery.jar"
                            + " --gcp-project <gcp-project> --bq-dataset <dataset name>"
                            + " --bq-table <table name> --agg-prop <payload's property>"
                            + " --restriction <single-quoted string with row predicate>"
                            + " --limit <optional: limit records returned>");
            return;
        }

        String projectName = parameterTool.getRequired("gcp-project");
        String datasetName = parameterTool.getRequired("bq-dataset");
        String tableName = parameterTool.getRequired("bq-table");
        String rowRestriction = parameterTool.get("restriction", "").replace("\\u0027", "'");
        Integer recordLimit = parameterTool.getInt("limit", -1);
        String recordPropertyToAggregate = parameterTool.getRequired("agg-prop");

        runFlinkJob(
                projectName,
                datasetName,
                tableName,
                recordPropertyToAggregate,
                rowRestriction,
                recordLimit);
    }

    private static void runFlinkJob(
            String projectName,
            String datasetName,
            String tableName,
            String recordPropertyToAggregate,
            String rowRestriction,
            Integer limit)
            throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(60000L);
[Collaborator review comment] Can we get this as an user provided configuration
[Collaborator reply] Addressed the same concern in previous PR (#47).
        /*
         * We will be reading Avro generic records from BigQuery, and in this case we assume
         * the GOOGLE_APPLICATION_CREDENTIALS env variable is present in the execution runtime.
         * If different authentication is needed, the credentials builder (part of the
         * BigQueryConnectOptions) should enable capturing the credentials from various sources.
         */
        BigQuerySource<GenericRecord> bqSource =
                BigQuerySource.readAvros(
                        BigQueryReadOptions.builder()
                                .setBigQueryConnectOptions(
                                        BigQueryConnectOptions.builder()
                                                .setProjectId(projectName)
                                                .setDataset(datasetName)
                                                .setTable(tableName)
                                                .build())
                                .setRowRestriction(rowRestriction)
                                .build(),
                        limit);

        env.fromSource(bqSource, WatermarkStrategy.noWatermarks(), "BigQuerySource")
                .flatMap(new FlatMapper(recordPropertyToAggregate))
                .keyBy(t -> t.f0)
                .max("f1")
                .print();

        env.execute("Flink BigQuery Example");
    }

    static class FlatMapper implements FlatMapFunction<GenericRecord, Tuple2<String, Integer>> {

        private final String recordPropertyToAggregate;

        public FlatMapper(String recordPropertyToAggregate) {
            this.recordPropertyToAggregate = recordPropertyToAggregate;
        }

        @Override
        public void flatMap(GenericRecord record, Collector<Tuple2<String, Integer>> out)
                throws Exception {
            out.collect(Tuple2.of(record.get(recordPropertyToAggregate).toString(), 1));
        }
    }
}
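Putting the usage string and the row-restriction note together, a hypothetical submission could look as follows (every value is a placeholder; note the \u0027 workaround for single quotes when launching through Flink's REST API):

```
# Sketch: submit the example to a running Flink cluster. All parameter values
# are placeholders for illustration.
flink run BigQueryExample.jar \
    --gcp-project my-gcp-project \
    --bq-dataset my_dataset \
    --bq-table my_table \
    --agg-prop name \
    --restriction "TIMESTAMP_TRUNC(ingestion_timestamp, HOUR) = \u00272023-06-20 19:00:00\u0027" \
    --limit 1000
```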
48 changes: 0 additions & 48 deletions flink-connector-bigquery/pom.xml
@@ -162,58 +162,10 @@ under the License.
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-jar-plugin</artifactId>
-                <executions>
-                    <execution>
-                        <goals>
-                            <goal>test-jar</goal>
-                        </goals>
-                    </execution>
-                </executions>
             </plugin>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-surefire-plugin</artifactId>
-                <executions>
-                    <execution>
-                        <id>default-test</id>
-                        <phase>test</phase>
-                        <goals>
-                            <goal>test</goal>
-                        </goals>
-                        <configuration>
-                            <argLine>${argLine} -XX:+UseG1GC -Xms256m -Xmx1024m</argLine>
-                        </configuration>
-                    </execution>
-                    <execution>
-                        <id>integration-tests</id>
-                        <phase>integration-test</phase>
-                        <goals>
-                            <goal>test</goal>
-                        </goals>
-                        <configuration>
-                            <argLine>-XX:+UseG1GC -Xms256m -Xmx2048m</argLine>
-                        </configuration>
-                    </execution>
-                </executions>
-            </plugin>
-            <plugin>
-                <groupId>org.jacoco</groupId>
-                <artifactId>jacoco-maven-plugin</artifactId>
-                <executions>
-                    <execution>
-                        <id>prepare-agent</id>
-                        <goals>
-                            <goal>prepare-agent</goal>
-                        </goals>
-                    </execution>
-                    <execution>
-                        <id>report</id>
-                        <phase>install</phase>
-                        <goals>
-                            <goal>report</goal>
-                        </goals>
-                    </execution>
-                </executions>
-            </plugin>
         </plugins>
     </build>