Skip to content

Commit 4c743e1

Browse files
committed
Add GoogleFetcher
This allows the fetching of items using files.get from Google Drive
1 parent e2b41ec commit 4c743e1

File tree

5 files changed

+401
-1
lines changed

5 files changed

+401
-1
lines changed

tika-grpc/pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,12 @@
223223
<artifactId>tika-fetcher-http</artifactId>
224224
<version>${project.version}</version>
225225
</dependency>
226+
<!-- Google Drive fetcher -->
227+
<dependency>
228+
<groupId>org.apache.tika</groupId>
229+
<artifactId>tika-fetcher-google</artifactId>
230+
<version>${project.version}</version>
231+
</dependency>
226232
<dependency>
227233
<groupId>com.fasterxml.jackson.module</groupId>
228234
<artifactId>jackson-module-jsonSchema</artifactId>

tika-pipes/tika-fetchers/pom.xml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
<module>tika-fetcher-gcs</module>
3838
<module>tika-fetcher-az-blob</module>
3939
<module>tika-fetcher-microsoft-graph</module>
40+
<module>tika-fetcher-google</module>
4041
</modules>
4142

4243
<dependencies>
@@ -45,4 +46,4 @@
4546
<scm>
4647
<tag>3.0.0-rc1</tag>
4748
</scm>
48-
</project>
49+
</project>
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
Licensed to the Apache Software Foundation (ASF) under one
4+
or more contributor license agreements. See the NOTICE file
5+
distributed with this work for additional information
6+
regarding copyright ownership. The ASF licenses this file
7+
to you under the Apache License, Version 2.0 (the
8+
"License"); you may not use this file except in compliance
9+
with the License. You may obtain a copy of the License at
10+
11+
http://www.apache.org/licenses/LICENSE-2.0
12+
13+
Unless required by applicable law or agreed to in writing,
14+
software distributed under the License is distributed on an
15+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
KIND, either express or implied. See the License for the
17+
specific language governing permissions and limitations
18+
under the License.
19+
-->
20+
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Inherits from the tika-fetchers aggregator; dependencies declared below
       without a version (slf4j-api, commons-io, junit-jupiter) rely on
       dependency management from the parent chain. -->
  <parent>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-fetchers</artifactId>
    <version>3.0.0-SNAPSHOT</version>
  </parent>

  <artifactId>tika-fetcher-google</artifactId>
  <name>Google Tika Pipes Fetcher</name>

  <properties>
    <maven.compiler.source>11</maven.compiler.source>
    <maven.compiler.target>11</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

    <!-- Versions of the Google client libraries used by this module, kept
         together so they can be upgraded in one place. -->
    <google.api.client.version>2.2.0</google.api.client.version>
    <google.auth.library.version>1.19.0</google.auth.library.version>
    <google.api.services.drive.version>v3-rev20241027-2.0.0</google.api.services.drive.version>

    <!-- NOTE(review): the three properties below are not referenced anywhere
         in this POM. They are kept in case the parent chain's dependency
         management reads them; confirm and remove if it does not. -->
    <junit-jupiter-engine.version>5.11.0-M2</junit-jupiter-engine.version>
    <wiremock.version>3.3.1</wiremock.version>
    <mockito-junit-jupiter.version>5.3.1</mockito-junit-jupiter.version>
  </properties>

  <dependencies>
    <!-- Apache Tika Core -->
    <dependency>
      <groupId>${project.groupId}</groupId>
      <artifactId>tika-core</artifactId>
      <version>${project.version}</version>
    </dependency>

    <!-- Google API client (HTTP transport and JSON factory) -->
    <dependency>
      <groupId>com.google.api-client</groupId>
      <artifactId>google-api-client</artifactId>
      <version>${google.api.client.version}</version>
    </dependency>

    <!-- Google service account credentials -->
    <dependency>
      <groupId>com.google.auth</groupId>
      <artifactId>google-auth-library-oauth2-http</artifactId>
      <version>${google.auth.library.version}</version>
    </dependency>

    <!-- Google Drive API -->
    <dependency>
      <groupId>com.google.apis</groupId>
      <artifactId>google-api-services-drive</artifactId>
      <version>${google.api.services.drive.version}</version>
    </dependency>

    <!-- Logging -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
    </dependency>

    <!-- Apache Commons IO -->
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
    </dependency>

    <!-- Test Dependencies -->
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter</artifactId>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- The manifest is written by the JAR plugin; the compiler plugin has
           no "archive" parameter, so configuring it there has no effect. The
           module name matches the Java package of this module's sources. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <configuration>
          <archive>
            <manifestEntries>
              <Automatic-Module-Name>org.apache.tika.pipes.fetchers.google</Automatic-Module-Name>
            </manifestEntries>
          </archive>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <scm>
    <tag>3.0.0-BETA-rc1</tag>
  </scm>
</project>
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.pipes.fetchers.google;
18+
19+
import java.io.ByteArrayInputStream;
20+
import java.io.IOException;
21+
import java.io.InputStream;
22+
import java.nio.file.Files;
23+
import java.nio.file.Path;
24+
import java.util.ArrayList;
25+
import java.util.Base64;
26+
import java.util.List;
27+
import java.util.Map;
28+
29+
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
30+
import com.google.api.client.http.HttpRequestInitializer;
31+
import com.google.api.client.json.JsonFactory;
32+
import com.google.api.client.json.gson.GsonFactory;
33+
import com.google.api.services.drive.Drive;
34+
import com.google.api.services.drive.DriveScopes;
35+
import com.google.auth.http.HttpCredentialsAdapter;
36+
import com.google.auth.oauth2.GoogleCredentials;
37+
import org.slf4j.Logger;
38+
import org.slf4j.LoggerFactory;
39+
40+
import org.apache.tika.config.Field;
41+
import org.apache.tika.config.Initializable;
42+
import org.apache.tika.config.InitializableProblemHandler;
43+
import org.apache.tika.config.Param;
44+
import org.apache.tika.exception.TikaConfigException;
45+
import org.apache.tika.exception.TikaException;
46+
import org.apache.tika.io.TemporaryResources;
47+
import org.apache.tika.io.TikaInputStream;
48+
import org.apache.tika.metadata.Metadata;
49+
import org.apache.tika.parser.ParseContext;
50+
import org.apache.tika.pipes.fetcher.AbstractFetcher;
51+
import org.apache.tika.pipes.fetchers.google.config.GoogleDriveFetcherConfig;
52+
53+
54+
/**
55+
* GoogleDrive Fetcher allows the fetching of files from a Google Drive, using a
56+
* service account key.
57+
*
58+
* Fetch Keys are ${fileId},${subjectUser}, where the subject user is the
59+
* organizer of the file. This user is necessary as part of the key as the
60+
* service account must act on behalf of the user when querying for the file.
61+
*/
62+
public class GoogleDriveFetcher extends AbstractFetcher implements Initializable {
63+
private static final Logger LOGGER = LoggerFactory.getLogger(GoogleDriveFetcher.class);
64+
private static final JsonFactory JSON_FACTORY = GsonFactory.getDefaultInstance();
65+
66+
private GoogleCredentials baseCredentials;
67+
68+
private Drive driveService;
69+
private boolean spoolToTemp;
70+
private List<String> scopes;
71+
72+
private GoogleDriveFetcherConfig config = new GoogleDriveFetcherConfig();
73+
74+
public GoogleDriveFetcher() {
75+
scopes = new ArrayList<>();
76+
scopes.add(DriveScopes.DRIVE_READONLY);
77+
}
78+
79+
public GoogleDriveFetcher(GoogleDriveFetcherConfig config) {
80+
this.config = config;
81+
}
82+
83+
@Field
84+
public void setThrottleSeconds(String commaDelimitedLongs) throws TikaConfigException {
85+
String[] longStrings = (commaDelimitedLongs == null ? "" : commaDelimitedLongs).split(",");
86+
long[] seconds = new long[longStrings.length];
87+
for (int i = 0; i < longStrings.length; i++) {
88+
try {
89+
seconds[i] = Long.parseLong(longStrings[i]);
90+
} catch (NumberFormatException e) {
91+
throw new TikaConfigException(e.getMessage());
92+
}
93+
}
94+
setThrottleSeconds(seconds);
95+
}
96+
97+
public void setThrottleSeconds(long[] throttleSeconds) {
98+
config.setThrottleSeconds(throttleSeconds);
99+
}
100+
101+
@Field
102+
public void setSpoolToTemp(boolean spoolToTemp) {
103+
config.setSpoolToTemp(spoolToTemp);
104+
}
105+
106+
@Field
107+
public void setServiceAccountKeyBase64(String serviceAccountKeyBase64) {
108+
config.setServiceAccountKeyBase64(serviceAccountKeyBase64);
109+
}
110+
111+
@Field
112+
public void setSubjectUser(String subjectUser) {
113+
config.setSubjectUser(subjectUser);
114+
}
115+
116+
@Field
117+
public void setScopes(List<String> scopes) {
118+
config.setScopes(new ArrayList<>(scopes));
119+
if (config.getScopes().isEmpty()) {
120+
config.getScopes().add(DriveScopes.DRIVE_READONLY);
121+
}
122+
}
123+
124+
@Override
125+
public void initialize(Map<String, Param> map) throws TikaConfigException {
126+
try {
127+
baseCredentials = GoogleCredentials
128+
.fromStream(new ByteArrayInputStream(Base64.getDecoder().decode(config.getServiceAccountKeyBase64())))
129+
.createScoped(scopes);
130+
} catch (IOException e) {
131+
throw new TikaConfigException("Failed to initialize Google Drive service", e);
132+
}
133+
}
134+
135+
@Override
136+
public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException {
137+
}
138+
139+
@Override
140+
public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException {
141+
int tries = 0;
142+
Exception ex = null;
143+
TemporaryResources tmp = null;
144+
145+
do {
146+
long start = System.currentTimeMillis();
147+
try {
148+
String[] fetchKeySplit = fetchKey.split(",");
149+
if (fetchKeySplit.length != 2) {
150+
throw new TikaException("Invalid fetch key, expected format ${fileId},${subjectUser}: " + fetchKey);
151+
}
152+
153+
String fileId = fetchKeySplit[0];
154+
String subjectUser = fetchKeySplit[1];
155+
156+
GoogleCredentials delegatedCredentials = baseCredentials.createDelegated(subjectUser);
157+
final HttpRequestInitializer requestInitializer = new HttpCredentialsAdapter(delegatedCredentials);
158+
159+
driveService = new Drive.Builder(
160+
GoogleNetHttpTransport.newTrustedTransport(),
161+
JSON_FACTORY,
162+
requestInitializer).setApplicationName("tika-fetcher-google").build();
163+
164+
InputStream is = driveService.files()
165+
.get(fileId)
166+
.executeMediaAsInputStream();
167+
168+
if (is == null) {
169+
throw new IOException("Empty input stream when we tried to parse " + fetchKey);
170+
}
171+
172+
if (spoolToTemp) {
173+
tmp = new TemporaryResources();
174+
Path tmpPath = tmp.createTempFile(fileId + ".dat");
175+
Files.copy(is, tmpPath);
176+
return TikaInputStream.get(tmpPath);
177+
}
178+
return TikaInputStream.get(is);
179+
180+
} catch (Exception e) {
181+
LOGGER.warn("Exception fetching on retry=" + tries, e);
182+
ex = e;
183+
} finally {
184+
long elapsed = System.currentTimeMillis() - start;
185+
LOGGER.debug("Total to fetch {}", elapsed);
186+
}
187+
188+
long[] throttleSeconds = config.getThrottleSeconds();
189+
190+
LOGGER.warn("Sleeping for {} seconds before retry", throttleSeconds[tries]);
191+
try {
192+
Thread.sleep(throttleSeconds[tries] * 1000);
193+
} catch (InterruptedException e) {
194+
Thread.currentThread().interrupt();
195+
}
196+
} while (++tries < config.getThrottleSeconds().length);
197+
198+
throw new TikaException("Could not fetch " + fetchKey, ex);
199+
}
200+
}

0 commit comments

Comments
 (0)