diff --git a/README.md b/README.md
index 1ebf2c0..234094c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,23 @@
 # edf2parquet
 Simple utility package to convert EDF/EDF+ files into Apache Parquet format
-while preserving the EDF file header information and signal headers metadata information.
-Currently, each signal is stored as a separate parquet file, with the option to automatically
-add a pandas readable DatetimeIndex.
+while preserving the EDF file header and signal header metadata, with some nice enhanced features:
+- handling of non-strictly-compliant .EDF headers (e.g. UTF-8 characters in the header)
+- automatic conversion of the EDF header start date and signal sampling frequency to a pd.DatetimeIndex with the correct timezone and frequency, for easy Pandas interoperability (at the cost of slightly larger file sizes)
+- skipping of specific signals during conversion
+- bundling of signals with the same sampling frequency into a single parquet file
+- splitting of EDF files by non-use periods (e.g. a recording spanning multiple continuous nights can be split into one file per night)
+- compression of the resulting parquet files
+
 ## Installation
+
+### Requirements
+The package was tested with the pinned versions in the `requirements.txt` file.
+If something does not work, try installing these exact versions. I would particularly
+advise using matching or more recent versions of PyArrow and Pandas. Pandas 2.0 is
+important because it uses Arrow data structures under the hood, so as far as I'm aware
+anything below 2.0 will break.
+
 ```bash
 pip install git+https://github.com/NarayanSchuetz/edf2parquet.git
 ```
@@ -16,7 +28,7 @@ Convert an EDF file into Apache parquet format using the EdfToParquetConverter c
 ```python
 import pytz
-from edf2parquet.converters import EdfToParquetConverter
+from edf2parquet.converters import EdfToParquetConverter, AdvancedEdfToParquetConverter
 
 my_edf_file_path = "path_to_my_edfile.edf"
 my_parquet_output_dir = "path_to_my_parquet_output_dir"
@@ -27,6 +39,18 @@ converter = EdfToParquetConverter(edf_file_path=my_edf_file_path,
                                   parquet_output_dir=my_parquet_output_dir,
                                   compression_codec="GZIP")
 
+converter.convert()
+
+# or alternatively, using the advanced converter:
+converter = AdvancedEdfToParquetConverter(edf_file_path=my_edf_file_path,  # path to the EDF file
+                                          exclude_signals=["Audio"],  # signals to exclude from the conversion
+                                          parquet_output_dir=my_parquet_output_dir,  # output directory (created if it does not exist)
+                                          group_by_sampling_freq=True,  # bundle signals with the same sampling frequency into one parquet file
+                                          datetime_index=True,  # automatically add a pd.DatetimeIndex to the resulting parquet files
+                                          local_timezone=(pytz.timezone("Europe/Zurich"), pytz.timezone("Europe/Zurich")),  # timezone of the EDF file and of its start_date (the same in most cases)
+                                          compression_codec="GZIP",  # compression codec for the resulting parquet files
+                                          split_non_use_by_col="MY_COLUMN")  # optional: column used to split the file by non-use periods (e.g. one file per night); see the AdvancedEdfToParquetConverter docstring
+converter.convert()
 ```
 ### Reading:
@@ -62,8 +86,7 @@ reader.get_signal_headers()
 ```
 Check the `examples.ipynb` notebook for detailed outputs.
 
 ## Todo
-- [ ] Allow to bundle signals with the same sampling rate into a single parquet file.
+- [x] Allow bundling signals with the same sampling rate into a single parquet file.
 - [ ] Provide a high level user API
 - [ ] Enable (possibly distributed) parallel processing to efficiently convert a whole directory of EDF files.
-- [ ] Provide a high level API to convert EDF files with the same sampling frequency (fs) into a single parquet file with a single row per signal.
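A quick way to sanity-check the new grouped output from this branch is to read one parquet file back with plain pandas/pyarrow. This is a minimal sketch only: the output file name below is hypothetical (actual names depend on the converter), and it assumes the EDF header fields land in the file-level schema metadata; the package's own reader shown in the README's Reading section remains the supported path.

```python
import pandas as pd
import pyarrow.parquet as pq

# Hypothetical output name -- check what the converter actually wrote.
path = "path_to_my_parquet_output_dir/signals_256Hz.parquet"

# pandas>=2.0 (required by this branch) can keep the data Arrow-backed end to end.
df = pd.read_parquet(path, dtype_backend="pyarrow")
print(df.index[:3])  # a tz-aware DatetimeIndex is expected when datetime_index=True

# Assumption: EDF header fields are preserved as key/value schema metadata.
print(pq.read_schema(path).metadata)
```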
diff --git a/edf2parquet/converters.py b/edf2parquet/converters.py
index fac4697..723d83d 100644
--- a/edf2parquet/converters.py
+++ b/edf2parquet/converters.py
@@ -50,7 +50,7 @@ def __init__(
         """
         self._datetime_index = datetime_index
         self._default_signal_dtype = default_signal_dtype
-        self._edf_file = pyedflib.EdfReader(edf_file_path)
+        self._edf_reader = EdfReader(edf_file_path)
         self._parquet_output_dir = parquet_output_dir
         self._compression_codec = compression_codec
         self._local_timezone = local_timezone
@@ -61,6 +61,10 @@ def __del__(self) -> None:
     def __repr__(self) -> str:
         return f"EdfToParquetConverter({self._edf_file.getHeader()})"
 
+    @property
+    def _edf_file(self):
+        return self._edf_reader.edf_file
+
     def convert(self) -> Optional[Dict[str, pa.Table]]:
         """
         Converts an EDF/EDF+ file to Apache Parquet file format.
@@ -241,7 +245,6 @@ def __init__(
         """
         super().__init__(edf_file_path, datetime_index, default_signal_dtype, parquet_output_dir,
                          compression_codec, local_timezone)
-        self._edf_reader = EdfReader(edf_file_path)
         self._group_by_sampling_freq = group_by_sampling_freq
         self._exclude_signals = exclude_signals
         self._split_non_use_by_col = split_non_use_by_col
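The refactor above swaps the raw pyedflib handle for the package's EdfReader wrapper while keeping `_edf_file` alive as a read-only property, so existing call sites such as `__repr__` (and the subclass, which previously opened a second reader) keep working unchanged. A toy illustration of that delegation pattern, with made-up stand-in classes rather than the package's real ones:

```python
class RawHandle:
    """Stands in for pyedflib.EdfReader."""
    def getHeader(self) -> dict:
        return {"startdate": "2023-04-23"}

class Wrapper:
    """Stands in for edf2parquet's EdfReader wrapper."""
    def __init__(self, path: str) -> None:
        self.edf_file = RawHandle()  # real code would open `path` here

class Converter:
    def __init__(self, path: str) -> None:
        self._edf_reader = Wrapper(path)  # own the wrapper ...

    @property
    def _edf_file(self) -> RawHandle:
        # ... but keep the old attribute name working for existing call sites.
        return self._edf_reader.edf_file

print(Converter("night1.edf")._edf_file.getHeader())  # {'startdate': '2023-04-23'}
```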
diff --git a/examples.ipynb b/examples.ipynb
index efe3c63..e5fe894 100644
--- a/examples.ipynb
+++ b/examples.ipynb
@@ -34,6 +34,41 @@
    "converter.convert()"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "source": [
+   "### Read an EDF file and convert it to Parquet files using the AdvancedEdfToParquetConverter class directly."
+  ],
+  "metadata": {
+   "collapsed": false
+  }
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "outputs": [],
+  "source": [
+   "from edf2parquet.converters import AdvancedEdfToParquetConverter\n",
+   "import pytz\n",
+   "\n",
+   "my_edf_file = \"path_to_my_edfile.edf\"  # REPLACE WITH YOUR EDF FILE PATH\n",
+   "my_parquet_output_dir = \"path_to_my_parquet_output_dir\"  # REPLACE WITH YOUR PARQUET OUTPUT DIRECTORY\n",
+   "\n",
+   "converter = AdvancedEdfToParquetConverter(edf_file_path=my_edf_file,  # path to the EDF file\n",
+   "                                          exclude_signals=[\"Audio\"],  # signals to exclude from the conversion\n",
+   "                                          parquet_output_dir=my_parquet_output_dir,  # output directory (created if it does not exist)\n",
+   "                                          group_by_sampling_freq=True,  # bundle signals with the same sampling frequency into one parquet file\n",
+   "                                          datetime_index=True,  # automatically add a pd.DatetimeIndex to the resulting parquet files\n",
+   "                                          local_timezone=(pytz.timezone(\"Europe/Zurich\"), pytz.timezone(\"Europe/Zurich\")),  # timezone of the EDF file and of its start_date (the same in most cases)\n",
+   "                                          compression_codec=\"GZIP\",  # compression codec for the resulting parquet files\n",
+   "                                          split_non_use_by_col=\"MY_COLUMN\")  # optional: column used to split the file by non-use periods (e.g. one file per night); see the AdvancedEdfToParquetConverter docstring\n",
+   "\n",
+   "converter.convert()"
+  ],
+  "metadata": {
+   "collapsed": false
+  }
+ },
 {
  "cell_type": "markdown",
  "metadata": {},
@@ -220,19 +255,6 @@
     "end_time": "2023-04-23T18:37:56.725993Z"
    }
   }
- },
- {
-  "cell_type": "code",
-  "execution_count": 1,
-  "outputs": [],
-  "source": [],
-  "metadata": {
-   "collapsed": false,
-   "ExecuteTime": {
-    "start_time": "2023-04-25T17:04:41.598905Z",
-    "end_time": "2023-04-25T17:04:41.602388Z"
-   }
-  }
 }
 ],
 "metadata": {
diff --git a/setup.py b/setup.py
index ccbcd3d..9a521ca 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 MAJOR = "0"
 MINOR = "1"
-PATCH = "1"
+PATCH = "2"
 
 _VERSION_TAG = "{MAJOR}.{MINOR}.{PATCH}".format(MAJOR=MAJOR, MINOR=MINOR, PATCH=PATCH)
 
@@ -12,8 +12,8 @@
 
 def get_version():
-    import subprocess
-    commit_hash = str(subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE).stdout)[2:-3]
+    # import subprocess
+    # commit_hash = str(subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE).stdout)[2:-3]
     return '{VERSION_TAG}'.format(VERSION_TAG=_VERSION_TAG)
 
@@ -30,9 +30,9 @@
     install_requires=[
         'pytest',
         'numpy',
-        'pandas',
+        'pandas>=2.0.0',
         'pyarrow',
-        'pyedflib==0.1.19',  # newer versions
+        'pyedflib>=0.1.32',
     ],
     setup_requires=[
         'pytest-runner',
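Since setup.py raises the dependency floors (pandas>=2.0.0, pyedflib>=0.1.32), a small pre-flight check can catch an incompatible environment before upgrading to 0.1.2. This is my own sketch, not part of the package; it uses only the standard library:

```python
# Print installed versions against the floors declared in setup.py.
from importlib.metadata import PackageNotFoundError, version

for dist, floor in [("pandas", "2.0.0"), ("pyEDFlib", "0.1.32"), ("pyarrow", None)]:
    try:
        installed = version(dist)
    except PackageNotFoundError:
        installed = "not installed"
    print(f"{dist}: {installed}" + (f" (need >= {floor})" if floor else ""))
```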