One star knowing it all...
# Given some raw input data:
DATA = {
'a': 1.5,
'b': 1458266965.250572,
'c': [{'x': {'y': 'LT121000011101001000'}}, {'z': 'Omega'}]}
# Define a schema. Each leaf is 'new-name#concept|converter'; the part
# after '|' is applied to the value ('to.*' names a registered converter).
SCHEMA = {
'a': 'price#EUR|to.decimal',
'b': 'timestamp#date|to.unixtime',
'c': [{'*': 'contributions',
'x': {'*': 'origins', 'y': 'account#IBAN|to.string'},
'z': 'company#name|to.string'}],
}
# Normalize the data against the schema:
metaform.load(DATA).format(SCHEMA)
# Or embed a schema reference under the '*' key, so the data packet
# carries its own schema and .format() needs no argument:
metaform.load(dict(DATA, **{'*': 'https://github.com/wefindx/schema/wiki/Sale#test'})).format()
Metaform is a package for hierarchical and nested data normalization.
pip install metaform
import metaform
# INPUT: the '*' key points at the schema used to normalize the remaining keys.
metaform.load({
'*': 'https://github.com/mindey/terms/wiki/person#foaf',
'url': 'http://dbpedia.org/resource/John_Lennon',
'fullname': 'John Lennon',
'birthdate': '1940-10-09',
'spouse': 'http://dbpedia.org/resource/Cynthia_Lennon'
}).format(refresh=True)
# (schemas are cached locally, pass refresh=True to redownload)
# OUTPUT
{
'*': 'GH:mindey/terms/person#foaf',
'jsonld:id': 'http://dbpedia.org/resource/John_Lennon',
'foaf:name': 'John Lennon',
'schema:birthDate': datetime.datetime(1940, 10, 9, 0, 0),
'schema:spouse': 'http://dbpedia.org/resource/Cynthia_Lennon'
}
# Load remote JSON, attach a wiki schema reference via the '*' key, then normalize.
data = metaform.load('https://www.metaculus.com/api2/questions/')
data['*'] = 'https://github.com/wefindx/schema/wiki/Topic#metaculuscom-question'
data.format()
# Try it!
# Read a headerless CSV; here the schema is keyed by column index.
df = metaform.read_csv(
'https://gist.githubusercontent.com/mindey/3f2596e108a5c151f32e1967275a7689/raw/7c4c963219255008fdb438e8b9777cd658eea02e/hello-world.csv',
schema={
0: 'Timestamp|to.unixtime',
1: 'KeyUpOrDown|lambda x: x=="k↓" and "KeyDown" or (x=="k↑" and "KeyUp")',
2: 'KeyName'},
header=None
)
Alternatively, save the schema to a wiki as shown here, and include the schema token in the filename by encoding it as a sub-extension — that is, rename hello-world.csv
to hello-world.GH~mindey+schema+KeyEvent@mykeylogger-01.csv:
# To get the schema token for the filename (GH~mindey+schema+KeyEvent@mykeylogger-01):
metaform.metawiki.url2ext('https://github.com/mindey/schema/wiki/KeyEvent#mykeylogger-01')
# Then rename the file at the source and read it remotely or locally from disk;
# the schema token is picked up from the sub-extension, so no schema= argument is needed:
df = metaform.read_csv('https://gist.githubusercontent.com/mindey/f33978b31468097b5003f032d5d85eb8/raw/9541191e4d99c052a7668223697ef0ef9ce37977/hello-world.GH~mindey+schema+KeyEvent@mykeylogger-01.csv', header=None)
metaform.load( DATA ).format( SCHEMA )
Let’s say we have some data:
# Sample nested data used throughout the examples below.
data = {
'hello': 1.0,
'world': 2,
'how': ['is', {'are': {'you': 'doing'}}]
}
We can get a template for defining the schema by calling metaform.template:
metaform.template(data)
{'*': '', 'hello': {'*': ''}, 'how': [{'*': '', 'are': {'you': {'*': ''}}}], 'world': {'*': ''}}
This provides an opportunity to specify metadata for each key and the object itself. For example:
# Each key maps to {'*': 'new-name'}; the right-hand comment column shows
# the equivalent shorthand (a bare string instead of the {'*': ...} dict).
schema = { # A # schema = {
'*': 'greeting', # L # '*': 'greeting',
'hello': {'*': 'length'}, # T # 'hello': 'length',
'world': {'*': 'atoms'}, # E # 'world': 'atoms',
'how': [ # R # 'how': [
{'*': 'method', # N # {'*': 'method',
'are': { # A # 'are': {
'*': 'yup', # T # '*': 'yup',
'you': {'*': 'me'}} # I # 'you': {'*': 'me'}}
} # V # }
]} # E # ]}
metaform.normalize(data, schema)
{'atoms': 2, 'length': 1.0, 'method': ['is', {'yup': {'me': 'doing'}}]}
We recommend saving the schemas you create for normalization in data
analytics and driver projects in dot-folders named .schema,
as JSON or YAML files inside that folder.
So, we have access to all keys, and can specify, what to do with them:
# After '|' a converter expression is applied to the value; inline lambdas work.
schema = {
'*': 'greeting',
'hello': 'length|lambda x: x+5.',
'world': 'atoms|lambda x: str(x)+"ABC"',
'how': [
{'*': 'method',
'are': {
'*': 'yup',
'you': {'*': 'me|lambda x: "-".join(list(x))'}}
}
]}
metaform.normalize(data, schema)
{'atoms': '2ABC', 'length': 6.0, 'method': ['is', {'yup': {'me': 'd-o-i-n-g'}}]}
And suppose, we want to define a more complex function, inconvenient via lambdas:
from metaform import converters
def some_func(x):
    """Scale *x* by the difference of two constants (345 - 123 = 222)."""
    low = 123
    high = 345
    return x * (high - low)
# Attach the function to `converters`; schemas can then reference it as 'to.func'.
converters.func = some_func
schema = {
'*': 'greeting',
'hello': 'length|to.func',
'world': 'atoms|lambda x: str(x)+"ABC"',
'how': [
{'*': 'method',
'are': {
'*': 'yup',
'you': {'*': 'me|lambda x: "-".join(list(x))'}}
}
]}
metaform.normalize(data, schema)
{'atoms': '2ABC', 'length': 222.0, 'method': ['is', {'yup': {'me': 'd-o-i-n-g'}}]}
We just renamed the keys, and normalized values! What else could we want?
Suppose we have similar data from different sources. For example, topics and comments are not so different after all, because if a comment becomes large enough, it can stand as a topic of its own.
# Fetch raw topics and comments from the same API.
topics = requests.get('https://api.infty.xyz/topics/?format=json').json()['results']
comments = requests.get('https://api.infty.xyz/comments/?format=json').json()['results']
Let’s define templates for them, with the key names and types to match:
# Schema for topics: rename the id keys, decode the integer 'type' codes to
# labels, and coerce 'blockchain' to a bool. Unlisted keys pass through as-is.
topics_schema = [{
'id': 'topic-id',
'type': '|lambda x: {0: "NEED", 1: "GOAL", 2: "IDEA", 3: "PLAN", 4: "STEP", 5: "TASK"}.get(x)',
'owner': {'id': 'user-id'},
'blockchain': '|lambda x: x and True or False',
}]
normal_topics = metaform.normalize(topics, topics_schema)
# Flatten the nested records into DataFrame columns (e.g. owner.user-id).
topics_df = pandas.io.json.json_normalize(normal_topics)
topics_df.dtypes
blockchain bool body object categories object categories_names object children object comment_count int64 created_date object data object declared float64 editors object funds float64 is_draft bool languages object matched float64 owner.user-id int64 owner.username object parents object title object topic-id int64 type object updated_date object url object dtype: object
# Schema for comments, with key names chosen to line up with topics_schema
# (e.g. 'text' becomes 'body') so the two DataFrames can be concatenated.
comments_schema = [{
'id': 'comment-id',
'topic': 'topic-url',
'text': 'body',
'owner': {'id': 'user-id'},
'blockchain': '|lambda x: x and True or False',
}]
normal_comments = metaform.normalize(comments, comments_schema)
comments_df = pandas.io.json.json_normalize(normal_comments)
comments_df.dtypes
assumed_hours object blockchain bool body object claimed_hours object comment-id int64 created_date object donated float64 languages object matched float64 owner.user-id int64 owner.username object parent object remains float64 topic-url object updated_date object url object dtype: object
df = pandas.concat([topics_df, comments_df], sort=False)