-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d79519a
commit ac5a935
Showing
10 changed files
with
310 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import typing as t | ||
|
||
from singer_sdk._singerlib.messages import ( | ||
Message, | ||
) | ||
|
||
class Mapper:
    """Custom mapper that normalizes whitespace in a record's ``page_content``."""

    def map_record_message(self, message_dict: dict) -> dict:
        """Collapse whitespace runs in ``record["page_content"]`` to single spaces.

        Newlines, tabs and repeated spaces are each replaced by one space, and
        leading/trailing whitespace is removed. The message is modified in
        place and the same dict object is returned.

        Args:
            message_dict: A record message as a dict; it must contain a
                ``"record"`` mapping.

        Returns:
            The same ``message_dict``, with ``page_content`` cleaned when it is
            present and is a string. Records without a string ``page_content``
            are returned unchanged rather than raising.

        Raises:
            KeyError: If ``message_dict`` has no ``"record"`` key.
        """
        record = message_dict["record"]
        page_content = record.get("page_content")
        if isinstance(page_content, str):
            # str.split() with no separator splits on any whitespace run
            # (including "\n"), so a separate newline pass is unnecessary.
            record["page_content"] = " ".join(page_content.split())
        return message_dict
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
plugins: | ||
mappers: | ||
- name: meltano-map-transformer | ||
variant: meltano | ||
pip_url: git+https://github.com/MeltanoLabs/meltano-map-transform.git@v0.0.4 | ||
mappings: | ||
- name: coalesce-gcp-ips | ||
config: | ||
stream_maps: | ||
gcp_ips: | ||
ipv4prefix: | ||
ipv6prefix: | ||
ipv4: record.get('ipv4prefix', '') | ||
ipv6: record.get('ipv6prefix', '') | ||
id: md5(record.get('ipv4prefix', record.get('ipv6prefix'))) | ||
- name: map-gpt-embeddings | ||
namespace: map_gpt_embeddings | ||
pip_url: git+https://github.com/MeltanoLabs/map-gpt-embeddings.git@tap_mapper | ||
executable: map-gpt-embeddings | ||
mappings: | ||
- name: add-embeddings | ||
config: | ||
document_text_property: page_content | ||
document_metadata_property: metadata | ||
- name: mapper-generic | ||
namespace: mapper_generic | ||
pip_url: git+https://github.com/pnadolny13/mapper-generic.git | ||
executable: mapper-generic | ||
mappings: | ||
- name: clean-text | ||
config: | ||
code_path: mappers/clean_text.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
100 changes: 100 additions & 0 deletions
100
data/plugins/extractors/tap-beautifulsoup--meltanolabs.lock
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
{ | ||
"plugin_type": "extractors", | ||
"name": "tap-beautifulsoup", | ||
"namespace": "tap_beautifulsoup", | ||
"variant": "meltanolabs", | ||
"label": "BeautifulSoup", | ||
"docs": "https://hub.meltano.com/extractors/tap-beautifulsoup--meltanolabs", | ||
"repo": "https://github.com/MeltanoLabs/tap-beautifulsoup", | ||
"pip_url": "git+https://github.com/MeltanoLabs/tap-beautifulsoup.git", | ||
"executable": "tap-beautifulsoup", | ||
"description": "Python library for pulling data out of HTML and XML files.", | ||
"logo_url": "https://hub.meltano.com/assets/logos/extractors/beautifulsoup.png", | ||
"capabilities": [ | ||
"about", | ||
"catalog", | ||
"discover", | ||
"schema-flattening", | ||
"state", | ||
"stream-maps" | ||
], | ||
"settings_group_validation": [ | ||
[ | ||
"output_folder", | ||
"parser", | ||
"site_url", | ||
"source_name" | ||
] | ||
], | ||
"settings": [ | ||
{ | ||
"name": "download_recursively", | ||
"kind": "boolean", | ||
"value": true, | ||
"label": "Download Recursively", | ||
"description": "Attempt to download all pages recursively into the output directory prior to parsing files. Set this to False if you've previously run `wget -r -A.html https://sdk.meltano.com/en/latest/`" | ||
}, | ||
{ | ||
"name": "find_all_kwargs", | ||
"kind": "object", | ||
"label": "Find All Kwargs", | ||
"description": "This dict contains all the kwargs that should be passed to the [`find_all`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all) call in order to extract text from the pages." | ||
}, | ||
{ | ||
"name": "flattening_enabled", | ||
"kind": "boolean", | ||
"label": "Flattening Enabled", | ||
"description": "'True' to enable schema flattening and automatically expand nested properties." | ||
}, | ||
{ | ||
"name": "flattening_max_depth", | ||
"kind": "integer", | ||
"label": "Flattening Max Depth", | ||
"description": "The max depth to flatten schemas." | ||
}, | ||
{ | ||
"name": "output_folder", | ||
"kind": "string", | ||
"value": "output", | ||
"label": "Output Folder", | ||
"description": "The path of the folder where the intermediate downloaded HTML files are written." | ||
}, | ||
{ | ||
"name": "parser", | ||
"kind": "options", | ||
"value": "html.parser", | ||
"label": "Parser", | ||
"description": "The BeautifulSoup parser to use.", | ||
"options": [ | ||
{ | ||
"label": "Html Parser", | ||
"value": "html.parser" | ||
} | ||
] | ||
}, | ||
{ | ||
"name": "site_url", | ||
"kind": "string", | ||
"label": "Site URL", | ||
"description": "The site you'd like to scrape. The tap will download all pages recursively into the output directory prior to parsing files." | ||
}, | ||
{ | ||
"name": "source_name", | ||
"kind": "string", | ||
"label": "Source Name", | ||
"description": "The name of the source you're scraping. This will be used as the stream name." | ||
}, | ||
{ | ||
"name": "stream_map_config", | ||
"kind": "object", | ||
"label": "Stream Map Config", | ||
"description": "User-defined config values to be used within map expressions." | ||
}, | ||
{ | ||
"name": "stream_maps", | ||
"kind": "object", | ||
"label": "Stream Maps", | ||
"description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
{ | ||
"plugin_type": "loaders", | ||
"name": "target-pinecone", | ||
"namespace": "target_pinecone", | ||
"variant": "meltanolabs", | ||
"label": "Pinecone", | ||
"docs": "https://hub.meltano.com/loaders/target-pinecone--meltanolabs", | ||
"repo": "https://github.com/MeltanoLabs/target-pinecone", | ||
"pip_url": "git+https://github.com/MeltanoLabs/target-pinecone.git", | ||
"executable": "target-pinecone", | ||
"description": "Vector Database for Vector Search", | ||
"logo_url": "https://hub.meltano.com/assets/logos/loaders/pinecone.png", | ||
"capabilities": [ | ||
"about", | ||
"schema-flattening", | ||
"stream-maps" | ||
], | ||
"settings_group_validation": [ | ||
[ | ||
"api_key", | ||
"document_text_property", | ||
"index_name", | ||
"pinecone_metadata_text_key" | ||
] | ||
], | ||
"settings": [ | ||
{ | ||
"name": "api_key", | ||
"kind": "password", | ||
"label": "API Key", | ||
"description": "Your Pinecone API key." | ||
}, | ||
{ | ||
"name": "dimensions", | ||
"kind": "integer", | ||
"value": 1536, | ||
"label": "Dimensions", | ||
"description": "The number of dimensions to use when creating a new index. An index is only created if it doesn't already exist. The default is `1536`, which is the dimensionality of the embeddings produced by OpenAI's text-embedding-ada-002 model." | ||
}, | ||
{ | ||
"name": "document_text_property", | ||
"kind": "string", | ||
"value": "text", | ||
"label": "Document Text Property", | ||
"description": "The property containing the document text in the input records." | ||
}, | ||
{ | ||
"name": "embeddings_property", | ||
"kind": "string", | ||
"value": "embeddings", | ||
"label": "Embeddings Property", | ||
"description": "The property containing the embeddings in the input records." | ||
}, | ||
{ | ||
"name": "environment", | ||
"kind": "string", | ||
"label": "Environment", | ||
"description": "The Pinecone environment to connect to." | ||
}, | ||
{ | ||
"name": "flattening_enabled", | ||
"kind": "boolean", | ||
"label": "Flattening Enabled", | ||
"description": "'True' to enable schema flattening and automatically expand nested properties." | ||
}, | ||
{ | ||
"name": "flattening_max_depth", | ||
"kind": "integer", | ||
"label": "Flattening Max Depth", | ||
"description": "The max depth to flatten schemas." | ||
}, | ||
{ | ||
"name": "index_name", | ||
"kind": "string", | ||
"label": "Index Name", | ||
"description": "Your Pinecone index name to write data to." | ||
}, | ||
{ | ||
"name": "metadata_property", | ||
"kind": "string", | ||
"value": "metadata", | ||
"label": "Metadata Property", | ||
"description": "The property containing the document metadata in the input records." | ||
}, | ||
{ | ||
"name": "pinecone_metadata_text_key", | ||
"kind": "password", | ||
"value": "text", | ||
"label": "Pinecone Metadata Text Key", | ||
"description": "The key in the Pinecone metadata entry that will contain the text document." | ||
}, | ||
{ | ||
"name": "stream_map_config", | ||
"kind": "object", | ||
"label": "Stream Map Config", | ||
"description": "User-defined config values to be used within map expressions." | ||
}, | ||
{ | ||
"name": "stream_maps", | ||
"kind": "object", | ||
"label": "Stream Maps", | ||
"description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." | ||
} | ||
] | ||
} |