Skip to content

Commit

Permalink
Pinecone pipelines (#675)
Browse files Browse the repository at this point in the history
  • Loading branch information
pnadolny13 authored Aug 3, 2023
1 parent d79519a commit ac5a935
Show file tree
Hide file tree
Showing 10 changed files with 310 additions and 16 deletions.
4 changes: 4 additions & 0 deletions data/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,7 @@ xyz

# DynamoDB AWS Account
TAP_DYNAMODB_AWS_ASSUME_ROLE_ARN="arn:aws:iam::******"


TARGET_PINECONE_API_KEY=****
OPENAI_API_KEY=*****
37 changes: 36 additions & 1 deletion data/extract/extractors.meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ plugins:
start_date: '2020-01-01T00:00:00Z'
key_properties: [id]
name: azure_ips
pattern: ServiceTags_Public_20230710.json
pattern: ServiceTags_Public_20230724.json
json_path: values
- name: tap-slack
variant: meltanolabs
Expand Down Expand Up @@ -271,3 +271,38 @@ plugins:
streams:
- stream_name: animals
input_filename: https://raw.githubusercontent.com/meltano/tap-smoke-test/main/demo-data/animals-data.jsonl
# Base BeautifulSoup extractor definition. It carries no config of its own;
# the tap-beautifulsoup-* entries in this file reference it via `inherit_from`
# and supply their own per-site config.
- name: tap-beautifulsoup
variant: meltanolabs
# Installed from the repository's default branch (no git ref is pinned).
pip_url: git+https://github.com/meltanolabs/tap-beautifulsoup.git
# Scraper for the site at sdk.meltano.com; reuses the tap-beautifulsoup
# plugin definition through `inherit_from`.
- name: tap-beautifulsoup-sdk
inherit_from: tap-beautifulsoup
config:
# Per the plugin lock file, source_name is used as the stream name.
source_name: sdk-docs
site_url: https://sdk.meltano.com/en/latest/
# Folder the intermediate downloaded HTML files are written to.
output_folder: output
parser: html.parser
# Download all pages into output_folder before parsing them.
download_recursively: true
# kwargs handed to BeautifulSoup's `find_all` call when extracting text;
# here they select on the attribute role=main.
find_all_kwargs:
attrs:
role: main
# Scraper for the site at edk.meltano.com; reuses the tap-beautifulsoup
# plugin definition through `inherit_from`.
- name: tap-beautifulsoup-edk
inherit_from: tap-beautifulsoup
config:
# Per the plugin lock file, source_name is used as the stream name.
source_name: edk-docs
site_url: https://edk.meltano.com/en/latest/
# Folder the intermediate downloaded HTML files are written to.
# NOTE(review): same folder name as the other tap-beautifulsoup-* entries;
# confirm the downloads cannot collide when more than one of them runs.
output_folder: output
parser: html.parser
# Download all pages into output_folder before parsing them.
download_recursively: true
# kwargs handed to BeautifulSoup's `find_all` call when extracting text;
# here they select on the attribute role=main.
find_all_kwargs:
attrs:
role: main
# Scraper for the site at docs.meltano.com; reuses the tap-beautifulsoup
# plugin definition through `inherit_from`.
- name: tap-beautifulsoup-meltano
inherit_from: tap-beautifulsoup
config:
# Per the plugin lock file, source_name is used as the stream name.
source_name: meltano-docs
site_url: https://docs.meltano.com/
# Folder the intermediate downloaded HTML files are written to.
output_folder: output
parser: html.parser
# Download all pages into output_folder before parsing them.
download_recursively: true
# kwargs handed to BeautifulSoup's `find_all` call when extracting text.
# Unlike the sdk/edk entries (attrs role=main), this one passes `text: true`.
# NOTE(review): presumably because this site has no role=main container;
# confirm the difference is intentional.
find_all_kwargs:
text: true
15 changes: 0 additions & 15 deletions data/extract/mappers.meltano.yml

This file was deleted.

9 changes: 9 additions & 0 deletions data/load/loaders.meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,12 @@ plugins:
- columnObjectTypeId: 0-2
columnName: org_last_active_date
propertyName: telemetry__last_active_at
# Pinecone loader. The lock file lists `api_key` as a required setting; it is
# not set here. NOTE(review): presumably supplied through the
# TARGET_PINECONE_API_KEY variable listed in data/.env.template; confirm.
- name: target-pinecone
variant: meltanolabs
config:
# Index that records are written to.
index_name: target-pinecone-index
environment: asia-southeast1-gcp-free
# Overrides the lock-file default (`text`): the document text is read from the
# `page_content` property of incoming records.
document_text_property: page_content
# The next three values equal the lock-file defaults and are spelled out
# explicitly.
embeddings_property: embeddings
metadata_property: metadata
# Key inside the Pinecone metadata entry that will hold the document text.
pinecone_metadata_text_key: text
15 changes: 15 additions & 0 deletions data/mappers/clean_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import typing as t

from singer_sdk._singerlib.messages import (
Message,
)

class Mapper:
    """Record mapper that normalizes whitespace in a record's ``page_content``.

    Referenced from the ``clean-text`` mapping of the ``mapper-generic`` plugin
    via its ``code_path`` setting.
    """

    def map_record_message(self, message_dict: dict) -> dict:
        """Collapse all whitespace in ``record.page_content`` to single spaces.

        Args:
            message_dict: A record message as a dict. It must contain a
                ``"record"`` mapping with a string ``"page_content"`` entry.

        Returns:
            The same ``message_dict`` object, mutated in place, with
            ``record["page_content"]`` stripped of leading/trailing whitespace
            and every internal run of whitespace (spaces, tabs, newlines)
            replaced by one space.

        Raises:
            KeyError: If ``"record"`` or ``"page_content"`` is missing.
        """
        record = message_dict["record"]
        # str.split() with no separator splits on any run of whitespace,
        # newlines included, so a separate newline pass is not needed.
        record["page_content"] = " ".join(record["page_content"].split())
        return message_dict

32 changes: 32 additions & 0 deletions data/mappers/mappers.meltano.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Mapper plugins for the project.
plugins:
mappers:
# Stream-map based transformer, pinned to the v0.0.4 git tag.
- name: meltano-map-transformer
variant: meltano
pip_url: git+https://github.com/MeltanoLabs/meltano-map-transform.git@v0.0.4
mappings:
# Reshapes the `gcp_ips` stream into ipv4 / ipv6 / id properties.
- name: coalesce-gcp-ips
config:
stream_maps:
gcp_ips:
# Empty values parse as null. NOTE(review): presumably this removes the
# original *prefix properties from the output (stream-map null semantics);
# confirm against the stream maps documentation.
ipv4prefix:
ipv6prefix:
# Copy each prefix into a new property, falling back to '' when absent.
ipv4: record.get('ipv4prefix', '')
ipv6: record.get('ipv6prefix', '')
# id is the md5 of ipv4prefix, or of ipv6prefix when ipv4prefix is absent.
id: md5(record.get('ipv4prefix', record.get('ipv6prefix')))
# Custom mapper plugin providing the `add-embeddings` mapping.
# NOTE(review): installed from the `tap_mapper` git ref; if that is a branch
# rather than a tag, installs are not reproducible. Confirm.
# NOTE(review): data/.env.template lists OPENAI_API_KEY; presumably this
# plugin reads it. Confirm.
- name: map-gpt-embeddings
namespace: map_gpt_embeddings
pip_url: git+https://github.com/MeltanoLabs/map-gpt-embeddings.git@tap_mapper
executable: map-gpt-embeddings
mappings:
- name: add-embeddings
config:
# Record property holding the document text; the same `page_content` property
# that mappers/clean_text.py rewrites and target-pinecone is configured to read.
document_text_property: page_content
# Record property holding the document metadata.
document_metadata_property: metadata
# Custom mapper plugin providing the `clean-text` mapping.
# NOTE(review): no git ref is pinned in pip_url, so installs track the
# repository's default branch.
- name: mapper-generic
namespace: mapper_generic
pip_url: git+https://github.com/pnadolny13/mapper-generic.git
executable: mapper-generic
mappings:
- name: clean-text
config:
# Python file that supplies the mapping logic.
# NOTE(review): presumably resolved relative to the Meltano project root
# (data/), i.e. data/mappers/clean_text.py; confirm.
code_path: mappers/clean_text.py
1 change: 1 addition & 0 deletions data/meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ include_paths:
- ./orchestrate/*.meltano.yml
- ./transform/*.meltano.yml
- ./utilities/*.meltano.yml
- ./mappers/*.meltano.yml
8 changes: 8 additions & 0 deletions data/orchestrate/orchestrators.meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ schedules:
interval: 0 0 * * *
job: sample_job

# Weekly trigger for the `reload_pinecone` job.
- name: reload_pinecone
# Cron expression: minute 0, hour 0, every Sunday.
interval: 0 0 * * 0
job: reload_pinecone

jobs:

- name: dynanmodb_el
Expand Down Expand Up @@ -177,3 +181,7 @@ jobs:
- name: sample_job
tasks:
- tap-smoke-test target-jsonl

# Single-task pipeline: the tap-beautifulsoup-sdk extractor, then the
# clean-text and add-embeddings mappings, then the target-pinecone loader.
# NOTE(review): only the SDK docs extractor is wired in; the
# tap-beautifulsoup-edk and tap-beautifulsoup-meltano extractors are defined
# but not referenced by this job. Confirm that is intentional.
- name: reload_pinecone
tasks:
- tap-beautifulsoup-sdk clean-text add-embeddings target-pinecone
100 changes: 100 additions & 0 deletions data/plugins/extractors/tap-beautifulsoup--meltanolabs.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{
"plugin_type": "extractors",
"name": "tap-beautifulsoup",
"namespace": "tap_beautifulsoup",
"variant": "meltanolabs",
"label": "BeautifulSoup",
"docs": "https://hub.meltano.com/extractors/tap-beautifulsoup--meltanolabs",
"repo": "https://github.com/MeltanoLabs/tap-beautifulsoup",
"pip_url": "git+https://github.com/MeltanoLabs/tap-beautifulsoup.git",
"executable": "tap-beautifulsoup",
"description": "Python library for pulling data out of HTML and XML files.",
"logo_url": "https://hub.meltano.com/assets/logos/extractors/beautifulsoup.png",
"capabilities": [
"about",
"catalog",
"discover",
"schema-flattening",
"state",
"stream-maps"
],
"settings_group_validation": [
[
"output_folder",
"parser",
"site_url",
"source_name"
]
],
"settings": [
{
"name": "download_recursively",
"kind": "boolean",
"value": true,
"label": "Download Recursively",
"description": "Attempt to download all pages recursively into the output directory prior to parsing files. Set this to False if you've previously run `wget -r -A.html https://sdk.meltano.com/en/latest/`."
},
{
"name": "find_all_kwargs",
"kind": "object",
"label": "Find All Kwargs",
"description": "This dict contains all the kwargs that should be passed to the [`find_all`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all) call in order to extract text from the pages."
},
{
"name": "flattening_enabled",
"kind": "boolean",
"label": "Flattening Enabled",
"description": "'True' to enable schema flattening and automatically expand nested properties."
},
{
"name": "flattening_max_depth",
"kind": "integer",
"label": "Flattening Max Depth",
"description": "The max depth to flatten schemas."
},
{
"name": "output_folder",
"kind": "string",
"value": "output",
"label": "Output Folder",
"description": "The file path of where to write the intermediate downloaded HTML files to."
},
{
"name": "parser",
"kind": "options",
"value": "html.parser",
"label": "Parser",
"description": "The BeautifulSoup parser to use.",
"options": [
{
"label": "Html Parser",
"value": "html.parser"
}
]
},
{
"name": "site_url",
"kind": "string",
"label": "Site URL",
"description": "The site you'd like to scrape. The tap will download all pages recursively into the output directory prior to parsing files."
},
{
"name": "source_name",
"kind": "string",
"label": "Source Name",
"description": "The name of the source you're scraping. This will be used as the stream name."
},
{
"name": "stream_map_config",
"kind": "object",
"label": "Stream Map Config",
"description": "User-defined config values to be used within map expressions."
},
{
"name": "stream_maps",
"kind": "object",
"label": "Stream Maps",
"description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)."
}
]
}
105 changes: 105 additions & 0 deletions data/plugins/loaders/target-pinecone--meltanolabs.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{
"plugin_type": "loaders",
"name": "target-pinecone",
"namespace": "target_pinecone",
"variant": "meltanolabs",
"label": "Pinecone",
"docs": "https://hub.meltano.com/loaders/target-pinecone--meltanolabs",
"repo": "https://github.com/MeltanoLabs/target-pinecone",
"pip_url": "git+https://github.com/MeltanoLabs/target-pinecone.git",
"executable": "target-pinecone",
"description": "Vector Database for Vector Search",
"logo_url": "https://hub.meltano.com/assets/logos/loaders/pinecone.png",
"capabilities": [
"about",
"schema-flattening",
"stream-maps"
],
"settings_group_validation": [
[
"api_key",
"document_text_property",
"index_name",
"pinecone_metadata_text_key"
]
],
"settings": [
{
"name": "api_key",
"kind": "password",
"label": "API Key",
"description": "Your Pinecone API key."
},
{
"name": "dimensions",
"kind": "integer",
"value": 1536,
"label": "Dimensions",
"description": "The number of dimensions to use when creating a new index. An index is only created if it doesn't already exist. The default is `1536`, which is the dimensionality of embeddings produced by OpenAI's text-embedding-ada-002 model."
},
{
"name": "document_text_property",
"kind": "string",
"value": "text",
"label": "Document Text Property",
"description": "The property containing the document text in the input records."
},
{
"name": "embeddings_property",
"kind": "string",
"value": "embeddings",
"label": "Embeddings Property",
"description": "The property containing the embeddings in the input records."
},
{
"name": "environment",
"kind": "string",
"label": "Environment",
"description": "Your Pinecone environment, for example `asia-southeast1-gcp-free`."
},
{
"name": "flattening_enabled",
"kind": "boolean",
"label": "Flattening Enabled",
"description": "'True' to enable schema flattening and automatically expand nested properties."
},
{
"name": "flattening_max_depth",
"kind": "integer",
"label": "Flattening Max Depth",
"description": "The max depth to flatten schemas."
},
{
"name": "index_name",
"kind": "string",
"label": "Index Name",
"description": "Your Pinecone index name to write data to."
},
{
"name": "metadata_property",
"kind": "string",
"value": "metadata",
"label": "Metadata Property",
"description": "The property containing the document metadata in the input records."
},
{
"name": "pinecone_metadata_text_key",
"kind": "password",
"value": "text",
"label": "Pinecone Metadata Text Key",
"description": "The key in the Pinecone metadata entry that will contain the text document."
},
{
"name": "stream_map_config",
"kind": "object",
"label": "Stream Map Config",
"description": "User-defined config values to be used within map expressions."
},
{
"name": "stream_maps",
"kind": "object",
"label": "Stream Maps",
"description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)."
}
]
}

0 comments on commit ac5a935

Please sign in to comment.