diff --git a/data/.env.template b/data/.env.template index c2a51198..89f8e5e6 100644 --- a/data/.env.template +++ b/data/.env.template @@ -35,3 +35,7 @@ xyz # DynamoDB AWS Account TAP_DYNAMODB_AWS_ASSUME_ROLE_ARN="arn:aws:iam::******" + + +TARGET_PINECONE_API_KEY=**** +OPENAI_API_KEY=***** diff --git a/data/extract/extractors.meltano.yml b/data/extract/extractors.meltano.yml index 2faeb35e..b444dcea 100644 --- a/data/extract/extractors.meltano.yml +++ b/data/extract/extractors.meltano.yml @@ -74,7 +74,7 @@ plugins: start_date: '2020-01-01T00:00:00Z' key_properties: [id] name: azure_ips - pattern: ServiceTags_Public_20230710.json + pattern: ServiceTags_Public_20230724.json json_path: values - name: tap-slack variant: meltanolabs @@ -271,3 +271,38 @@ plugins: streams: - stream_name: animals input_filename: https://raw.githubusercontent.com/meltano/tap-smoke-test/main/demo-data/animals-data.jsonl + - name: tap-beautifulsoup + variant: meltanolabs + pip_url: git+https://github.com/meltanolabs/tap-beautifulsoup.git + - name: tap-beautifulsoup-sdk + inherit_from: tap-beautifulsoup + config: + source_name: sdk-docs + site_url: https://sdk.meltano.com/en/latest/ + output_folder: output + parser: html.parser + download_recursively: true + find_all_kwargs: + attrs: + role: main + - name: tap-beautifulsoup-edk + inherit_from: tap-beautifulsoup + config: + source_name: edk-docs + site_url: https://edk.meltano.com/en/latest/ + output_folder: output + parser: html.parser + download_recursively: true + find_all_kwargs: + attrs: + role: main + - name: tap-beautifulsoup-meltano + inherit_from: tap-beautifulsoup + config: + source_name: meltano-docs + site_url: https://docs.meltano.com/ + output_folder: output + parser: html.parser + download_recursively: true + find_all_kwargs: + text: true \ No newline at end of file diff --git a/data/extract/mappers.meltano.yml b/data/extract/mappers.meltano.yml deleted file mode 100644 index 2103fe68..00000000 --- 
class Mapper:
    """Record mapper that normalizes whitespace in scraped page text.

    This module is referenced by the ``clean-text`` mapping through its
    ``code_path`` setting (``mappers/clean_text.py``).
    """

    def map_record_message(self, message_dict: dict) -> dict:
        """Collapse all whitespace in the record's ``page_content`` field.

        Every run of whitespace (spaces, tabs, newlines) is replaced by a
        single space, and leading/trailing whitespace is removed.

        Args:
            message_dict: A record message as a dict. It must contain a
                ``"record"`` mapping with a string ``"page_content"`` entry.

        Returns:
            The same ``message_dict`` object, mutated in place, with the
            cleaned text stored back under ``record["page_content"]``.

        Raises:
            KeyError: If ``"record"`` or ``"page_content"`` is missing.
        """
        record = message_dict["record"]
        # A bare ``str.split()`` already splits on newlines as well as on
        # spaces and tabs, so a single split/join pass is sufficient.
        record["page_content"] = " ".join(record["page_content"].split())
        return message_dict
coalesce-gcp-ips + config: + stream_maps: + gcp_ips: + ipv4prefix: + ipv6prefix: + ipv4: record.get('ipv4prefix', '') + ipv6: record.get('ipv6prefix', '') + id: md5(record.get('ipv4prefix', record.get('ipv6prefix'))) + - name: map-gpt-embeddings + namespace: map_gpt_embeddings + pip_url: git+https://github.com/MeltanoLabs/map-gpt-embeddings.git@tap_mapper + executable: map-gpt-embeddings + mappings: + - name: add-embeddings + config: + document_text_property: page_content + document_metadata_property: metadata + - name: mapper-generic + namespace: mapper_generic + pip_url: git+https://github.com/pnadolny13/mapper-generic.git + executable: mapper-generic + mappings: + - name: clean-text + config: + code_path: mappers/clean_text.py diff --git a/data/meltano.yml b/data/meltano.yml index 6122bd80..31d60c65 100644 --- a/data/meltano.yml +++ b/data/meltano.yml @@ -9,3 +9,4 @@ include_paths: - ./orchestrate/*.meltano.yml - ./transform/*.meltano.yml - ./utilities/*.meltano.yml +- ./mappers/*.meltano.yml diff --git a/data/orchestrate/orchestrators.meltano.yml b/data/orchestrate/orchestrators.meltano.yml index 2118f238..fdb9bbf8 100644 --- a/data/orchestrate/orchestrators.meltano.yml +++ b/data/orchestrate/orchestrators.meltano.yml @@ -71,6 +71,10 @@ schedules: interval: 0 0 * * * job: sample_job +- name: reload_pinecone + interval: 0 0 * * 0 + job: reload_pinecone + jobs: - name: dynanmodb_el @@ -177,3 +181,7 @@ jobs: - name: sample_job tasks: - tap-smoke-test target-jsonl + +- name: reload_pinecone + tasks: + - tap-beautifulsoup-sdk clean-text add-embeddings target-pinecone diff --git a/data/plugins/extractors/tap-beautifulsoup--meltanolabs.lock b/data/plugins/extractors/tap-beautifulsoup--meltanolabs.lock new file mode 100644 index 00000000..75ab25fd --- /dev/null +++ b/data/plugins/extractors/tap-beautifulsoup--meltanolabs.lock @@ -0,0 +1,100 @@ +{ + "plugin_type": "extractors", + "name": "tap-beautifulsoup", + "namespace": "tap_beautifulsoup", + "variant": 
"meltanolabs", + "label": "BeautifulSoup", + "docs": "https://hub.meltano.com/extractors/tap-beautifulsoup--meltanolabs", + "repo": "https://github.com/MeltanoLabs/tap-beautifulsoup", + "pip_url": "git+https://github.com/MeltanoLabs/tap-beautifulsoup.git", + "executable": "tap-beautifulsoup", + "description": "Python library for pulling data out of HTML and XML files.", + "logo_url": "https://hub.meltano.com/assets/logos/extractors/beautifulsoup.png", + "capabilities": [ + "about", + "catalog", + "discover", + "schema-flattening", + "state", + "stream-maps" + ], + "settings_group_validation": [ + [ + "output_folder", + "parser", + "site_url", + "source_name" + ] + ], + "settings": [ + { + "name": "download_recursively", + "kind": "boolean", + "value": true, + "label": "Download Recursively", + "description": "Attempt to download all pages recursively into the output directory prior to parsing files. Set this to False if you've previously run `wget -r -A.html https://sdk.meltano.com/en/latest/`" + }, + { + "name": "find_all_kwargs", + "kind": "object", + "label": "Find All Kwargs", + "description": "This dict contains all the kwargs that should be passed to the [`find_all`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all) call in order to extract text from the pages." + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Flattening Enabled", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Flattening Max Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "output_folder", + "kind": "string", + "value": "output", + "label": "Output Folder", + "description": "The file path of where to write the intermediate downloaded HTML files to." 
+ }, + { + "name": "parser", + "kind": "options", + "value": "html.parser", + "label": "Parser", + "description": "The BeautifulSoup parser to use.", + "options": [ + { + "label": "Html Parser", + "value": "html.parser" + } + ] + }, + { + "name": "site_url", + "kind": "string", + "label": "Site URL", + "description": "The site you'd like to scrape. The tap will download all pages recursively into the output directory prior to parsing files." + }, + { + "name": "source_name", + "kind": "string", + "label": "Source Name", + "description": "The name of the source you're scraping. This will be used as the stream name." + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config", + "description": "User-defined config values to be used within map expressions." + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps", + "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." 
+ } + ] +} \ No newline at end of file diff --git a/data/plugins/loaders/target-pinecone--meltanolabs.lock b/data/plugins/loaders/target-pinecone--meltanolabs.lock new file mode 100644 index 00000000..4cba8d25 --- /dev/null +++ b/data/plugins/loaders/target-pinecone--meltanolabs.lock @@ -0,0 +1,105 @@ +{ + "plugin_type": "loaders", + "name": "target-pinecone", + "namespace": "target_pinecone", + "variant": "meltanolabs", + "label": "Pinecone", + "docs": "https://hub.meltano.com/loaders/target-pinecone--meltanolabs", + "repo": "https://github.com/MeltanoLabs/target-pinecone", + "pip_url": "git+https://github.com/MeltanoLabs/target-pinecone.git", + "executable": "target-pinecone", + "description": "Vector Database for Vector Search", + "logo_url": "https://hub.meltano.com/assets/logos/loaders/pinecone.png", + "capabilities": [ + "about", + "schema-flattening", + "stream-maps" + ], + "settings_group_validation": [ + [ + "api_key", + "document_text_property", + "index_name", + "pinecone_metadata_text_key" + ] + ], + "settings": [ + { + "name": "api_key", + "kind": "password", + "label": "API Key", + "description": "Your Pinecone API key." + }, + { + "name": "dimensions", + "kind": "integer", + "value": 1536, + "label": "Dimensions", + "description": "The amount of dimensions to use if creating a new index. An index is only created if it doesn't already exist. The default is `1536` which is the dimensions of the embeddings using OpenAI's text-embedding-ada-002 model." + }, + { + "name": "document_text_property", + "kind": "string", + "value": "text", + "label": "Document Text Property", + "description": "The property containing the document text in the input records." + }, + { + "name": "embeddings_property", + "kind": "string", + "value": "embeddings", + "label": "Embeddings Property", + "description": "The property containing the embeddings in the input records." 
+ }, + { + "name": "environment", + "kind": "string", + "label": "Environment", + "description": "The Pinecone environment your index is hosted in, e.g. `asia-southeast1-gcp-free`." + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Flattening Enabled", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Flattening Max Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "index_name", + "kind": "string", + "label": "Index Name", + "description": "Your Pinecone index name to write data to." + }, + { + "name": "metadata_property", + "kind": "string", + "value": "metadata", + "label": "Metadata Property", + "description": "The property containing the document metadata in the input records." + }, + { + "name": "pinecone_metadata_text_key", + "kind": "password", + "value": "text", + "label": "Pinecone Metadata Text Key", + "description": "The key in the Pinecone metadata entry that will contain the text document." + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config", + "description": "User-defined config values to be used within map expressions." + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps", + "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." + } + ] +} \ No newline at end of file