Skip to content

Commit

Permalink
fix docs
Browse files Browse the repository at this point in the history
  • Loading branch information
yinochaos committed Sep 7, 2020
1 parent 46db92b commit f566ea8
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 20 deletions.
3 changes: 0 additions & 3 deletions 0.0.1

This file was deleted.

4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ release: dist ## package and upload a release
twine upload dist/*

dist: clean ## builds source and wheel package
python setup.py sdist
python setup.py bdist_wheel
python3 setup.py sdist
python3 setup.py bdist_wheel
ls -l dist

install: clean ## install the package to the active Python's site-packages
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
datasets for easy machine learning use

- Free software: Apache Software License 2.0
- Documentation: <https://datasets.readthedocs.io>.
- Documentation: <https://ml-dataset.readthedocs.io>.

## Datasets API
--------
Expand Down
32 changes: 32 additions & 0 deletions datasets/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,38 @@
"""Console script for datasets."""
import argparse
import sys
import os
import tensorflow as tf
from concurrent.futures import ThreadPoolExecutor
from datasets.utils import TokenDicts, DataSchema, data_processor_dicts
from datasets.raw_dataset import RawDataset


def convert2tfrecords(dataset_file_path, file_suffix, tfrecord_file_path, n_workers):
    """Convert every matching file of a directory into a TFRecord file.

    Each selected entry of ``dataset_file_path`` is wrapped in a
    ``RawDataset`` and written to ``<tfrecord_file_path>/<index>.tfrecord``,
    where ``index`` is the entry's position in the sorted listing. The
    conversions are dispatched to a thread pool.

    Args:
        dataset_file_path: Directory whose entries are converted.
        file_suffix: Only entries whose name ends with this suffix are
            converted; ``None`` selects every entry.
        tfrecord_file_path: Directory that receives the ``.tfrecord`` files.
        n_workers: Maximum number of worker threads.

    Raises:
        OSError: If ``dataset_file_path`` cannot be listed.
        Exception: Any error raised while converting a file is re-raised
            when its result is consumed.
    """
    # Sort the listing so that the numeric output names are reproducible;
    # os.listdir returns entries in arbitrary order.
    filenames = sorted(os.listdir(dataset_file_path))
    if file_suffix is not None:
        filenames = [name for name in filenames if name.endswith(file_suffix)]

    def processor(filepath, filename, tfrecord_filename):
        """Write one source file to ``tfrecord_filename`` and return that path."""
        token_dicts = None
        data_field_list = [
            DataSchema(name='query', processor='to_np', type=tf.int32,
                       dtype='int32', shape=(None,), is_with_len=True),
        ]
        label_field = DataSchema(name='label', processor='to_np',
                                 type=tf.float32, dtype='float32', shape=(1,),
                                 is_with_len=False)
        # NOTE(review): the file name is passed as `file_suffix`, presumably so
        # that RawDataset selects exactly this file inside `filepath` --
        # confirm against RawDataset.
        generator = RawDataset(file_path=filepath, token_dicts=token_dicts,
                               data_field_list=data_field_list,
                               label_field=label_field, file_suffix=filename)
        generator.to_tfrecords(tfrecord_filename)
        return tfrecord_filename

    # One (source dir, source file name, output path) triple per input file.
    task_param_list = [
        (dataset_file_path, filename,
         os.path.join(tfrecord_file_path, str(index) + '.tfrecord'))
        for index, filename in enumerate(filenames)
    ]

    # The context manager joins the worker threads even if a conversion fails.
    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        # Executor.map passes a single item per call, so unpack each triple.
        for result in pool.map(lambda task: processor(*task), task_param_list):
            print(result, 'finish')


def main():
Expand Down
18 changes: 6 additions & 12 deletions datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,31 +19,25 @@
"""

from __future__ import absolute_import, division, print_function
import sys
import os
import io
import subprocess
import tensorflow as tf
import numpy as np
import codecs
import subprocess
import pickle
from collections import namedtuple
#from collections import namedtuple
import logging

from datasets.utils import TokenDicts, DataSchema, data_processor_dicts


class Dataset(object):
"""从文件流创建的dataset
利用tf.dataset支持多文件输入(本地和HDFS同时支持),对于大规模数据集十分友好;
并通过token_dicts和datafields支持配置化的数据处理,灵活支持多输入数据集的处理
text(local , hdfs):当前已经支持
@TODO : 支持多种文件读取方式
pickle:
tfrecord:
LMDB(Lightning Memory-Mapped Database(快如闪电的内存映射数据库)):
HDF5:
@TODO : add weight_fn SUPPORT
pickle:finish TODO test
tfrecord:code&test finish
LMDB(Lightning Memory-Mapped Database(快如闪电的内存映射数据库)):TODO
HDF5:TODO
"""
##
# @brief
Expand Down
2 changes: 1 addition & 1 deletion docs/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ To install datasets, run this command in your terminal:

.. code-block:: console
$ pip install datasets
$ pip install ml-dataset
This is the preferred method to install datasets, as it will always install the most recent stable release.

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/yinochaos/datasets',
version='0.0.4',
version='0.0.6',
zip_safe=False,
)

0 comments on commit f566ea8

Please sign in to comment.