From cc8ddf73120a2d2a8fc4286e6bda517b021361b0 Mon Sep 17 00:00:00 2001 From: Chen Xie Date: Mon, 2 May 2022 15:47:40 -0700 Subject: [PATCH 1/3] Move conversion functions to their own modules --- docs/io.rst | 2 +- docs/processing.rst | 5 + docs/wfdb.rst | 2 +- tests/io/test_convert.py | 110 ++ tests/test_record.py | 106 +- wfdb/__init__.py | 7 - wfdb/io/__init__.py | 13 +- wfdb/io/annotation.py | 5 +- wfdb/io/convert/__init__.py | 0 wfdb/io/convert/csv.py | 480 +++++++ wfdb/io/convert/edf.py | 988 ++++++++++++++ wfdb/io/convert/matlab.py | 310 +++++ wfdb/io/{ => convert}/tff.py | 0 wfdb/io/convert/wav.py | 368 ++++++ wfdb/io/record.py | 2379 +--------------------------------- wfdb/processing/__init__.py | 1 + wfdb/processing/filter.py | 168 +++ 17 files changed, 2451 insertions(+), 2493 deletions(-) create mode 100644 tests/io/test_convert.py create mode 100644 wfdb/io/convert/__init__.py create mode 100644 wfdb/io/convert/csv.py create mode 100644 wfdb/io/convert/edf.py create mode 100644 wfdb/io/convert/matlab.py rename wfdb/io/{ => convert}/tff.py (100%) create mode 100644 wfdb/io/convert/wav.py create mode 100644 wfdb/processing/filter.py diff --git a/docs/io.rst b/docs/io.rst index 8fea7811..dacfdea4 100644 --- a/docs/io.rst +++ b/docs/io.rst @@ -9,7 +9,7 @@ WFDB Records --------------- .. automodule:: wfdb.io - :members: rdrecord, rdheader, rdsamp, wrsamp, edf2mit, mit2edf, wav2mit, mit2wav, csv2mit + :members: rdrecord, rdheader, rdsamp, wrsamp, read_edf, wfdb_to_edf, read_wav, wfdb_to_wav, csv_to_wfdb, wfdb_to_mat .. autoclass:: wfdb.io.Record :members: wrsamp, adc, dac diff --git a/docs/processing.rst b/docs/processing.rst index 6abbbb6a..61e8013f 100644 --- a/docs/processing.rst +++ b/docs/processing.rst @@ -26,6 +26,11 @@ Peaks .. automodule:: wfdb.processing :members: find_peaks, find_local_peaks, correct_peaks +Filters +------- + +.. automodule:: wfdb.processing + :members: sigavg QRS Detectors ------------- diff --git a/docs/wfdb.rst b/docs/wfdb.rst index 8ce5f91c..7580694f 100644 --- a/docs/wfdb.rst +++ b/docs/wfdb.rst @@ -9,7 +9,7 @@ WFDB Records --------------- .. automodule:: wfdb - :members: rdrecord, rdheader, rdsamp, wrsamp, edf2mit, mit2edf, wav2mit, mit2wav, csv2mit + :members: rdrecord, rdheader, rdsamp, wrsamp .. autoclass:: wfdb.Record :members: wrsamp, adc, dac diff --git a/tests/io/test_convert.py b/tests/io/test_convert.py new file mode 100644 index 00000000..52b21da5 --- /dev/null +++ b/tests/io/test_convert.py @@ -0,0 +1,110 @@ +import numpy as np + +import wfdb +from wfdb.io import read_edf + + +class TestConvert: + def test_edf_uniform(self): + """ + EDF format conversion to MIT for uniform sample rates. + + """ + # Uniform sample rates + record_MIT = wfdb.rdrecord("sample-data/n16").__dict__ + record_EDF = read_edf("sample-data/n16.edf").__dict__ + + fields = list(record_MIT.keys()) + # Original MIT format method of checksum is outdated, sometimes + # the same value though + fields.remove("checksum") + # Original MIT format units are less comprehensive since they + # default to mV if unknown.. 
therefore added more default labels fields.remove("units") + + test_results = [] + for field in fields: + # Signal value will be slightly off due to C to Python type conversion + if field == "p_signal": + true_array = np.array(record_MIT[field]).flatten() + pred_array = np.array(record_EDF[field]).flatten() + # Prevent divide by zero warning + for i, v in enumerate(true_array): + if v == 0: + true_array[i] = 1 + pred_array[i] = 1 + sig_diff = np.abs((pred_array - true_array) / true_array) + sig_diff[sig_diff == -np.inf] = 0 + sig_diff[sig_diff == np.inf] = 0 + sig_diff = np.nanmean(sig_diff, 0) + # 5% tolerance + if np.max(sig_diff) <= 5: + test_results.append(True) + else: + test_results.append(False) + elif field == "init_value": + signal_diff = [ + abs(record_MIT[field][i] - record_EDF[field][i]) + for i in range(len(record_MIT[field])) + ] + if abs(max(min(signal_diff), max(signal_diff), key=abs)) <= 2: + test_results.append(True) + else: + test_results.append(False) + else: + # Compare the remaining MIT fields against their EDF counterparts + test_results.append(record_MIT[field] == record_EDF[field]) + + target_results = len(fields) * [True] + assert np.array_equal(test_results, target_results) + + def test_edf_non_uniform(self): + """ + EDF format conversion to MIT for non-uniform sample rates. + + """ + # Non-uniform sample rates + record_MIT = wfdb.rdrecord("sample-data/wave_4").__dict__ + record_EDF = read_edf("sample-data/wave_4.edf").__dict__ + + fields = list(record_MIT.keys()) + # Original MIT format method of checksum is outdated, sometimes + # the same value though + fields.remove("checksum") + # Original MIT format units are less comprehensive since they + # default to mV if unknown; therefore added more default labels + fields.remove("units") + + test_results = [] + for field in fields: + # Signal value will be slightly off due to C to Python type conversion + if field == "p_signal": + true_array = np.array(record_MIT[field]).flatten() + pred_array = np.array(record_EDF[field]).flatten() + # Prevent divide by zero warning + for i, v in enumerate(true_array): + if v == 0: + true_array[i] = 1 + pred_array[i] = 1 + sig_diff = np.abs((pred_array - true_array) / true_array) + sig_diff[sig_diff == -np.inf] = 0 + sig_diff[sig_diff == np.inf] = 0 + sig_diff = np.nanmean(sig_diff, 0) + # 5% tolerance + if np.max(sig_diff) <= 5: + test_results.append(True) + else: + test_results.append(False) + elif field == "init_value": + signal_diff = [ + abs(record_MIT[field][i] - record_EDF[field][i]) + for i in range(len(record_MIT[field])) + ] + if abs(max(min(signal_diff), max(signal_diff), key=abs)) <= 2: + test_results.append(True) + else: + test_results.append(False) + else: + # Compare the remaining MIT fields against their EDF counterparts + test_results.append(record_MIT[field] == record_EDF[field]) + + target_results = len(fields) * [True] + assert np.array_equal(test_results, target_results) diff --git a/tests/test_record.py b/tests/test_record.py index d0f4da1d..eb2090cb 100644 --- a/tests/test_record.py +++ b/tests/test_record.py @@ -1,9 +1,9 @@ import os -import pdb import shutil import unittest import numpy as np + import wfdb @@ -339,110 +339,6 @@ def test_2e(self): sig_target = sig_target.reshape([977, 1]) assert np.array_equal(sig, sig_target) - def test_2f(self): - """ - EDF format conversion to MIT for uniform sample rates. 
- - """ - # Uniform sample rates - record_MIT = wfdb.rdrecord("sample-data/n16").__dict__ - record_EDF = wfdb.edf2mit("sample-data/n16.edf").__dict__ - - fields = list(record_MIT.keys()) - # Original MIT format method of checksum is outdated, sometimes - # the same value though - fields.remove("checksum") - # Original MIT format units are less comprehensive since they - # default to mV if unknown.. therefore added more default labels - fields.remove("units") - - test_results = [] - for field in fields: - # Signal value will be slightly off due to C to Python type conversion - if field == "p_signal": - true_array = np.array(record_MIT[field]).flatten() - pred_array = np.array(record_EDF[field]).flatten() - # Prevent divide by zero warning - for i, v in enumerate(true_array): - if v == 0: - true_array[i] = 1 - pred_array[i] = 1 - sig_diff = np.abs((pred_array - true_array) / true_array) - sig_diff[sig_diff == -np.inf] = 0 - sig_diff[sig_diff == np.inf] = 0 - sig_diff = np.nanmean(sig_diff, 0) - # 5% tolerance - if np.max(sig_diff) <= 5: - test_results.append(True) - else: - test_results.append(False) - elif field == "init_value": - signal_diff = [ - abs(record_MIT[field][i] - record_EDF[field][i]) - for i in range(len(record_MIT[field])) - ] - if abs(max(min(signal_diff), max(signal_diff), key=abs)) <= 2: - test_results.append(True) - else: - test_results.append(False) - else: - test_results.append(record_MIT[field] == record_MIT[field]) - - target_results = len(fields) * [True] - assert np.array_equal(test_results, target_results) - - def test_2g(self): - """ - EDF format conversion to MIT for non-uniform sample rates. - - """ - # Non-uniform sample rates - record_MIT = wfdb.rdrecord("sample-data/wave_4").__dict__ - record_EDF = wfdb.edf2mit("sample-data/wave_4.edf").__dict__ - - fields = list(record_MIT.keys()) - # Original MIT format method of checksum is outdated, sometimes - # the same value though - fields.remove("checksum") - # Original MIT format units are less comprehensive since they - # default to mV if unknown.. therefore added more default labels - fields.remove("units") - - test_results = [] - for field in fields: - # Signal value will be slightly off due to C to Python type conversion - if field == "p_signal": - true_array = np.array(record_MIT[field]).flatten() - pred_array = np.array(record_EDF[field]).flatten() - # Prevent divide by zero warning - for i, v in enumerate(true_array): - if v == 0: - true_array[i] = 1 - pred_array[i] = 1 - sig_diff = np.abs((pred_array - true_array) / true_array) - sig_diff[sig_diff == -np.inf] = 0 - sig_diff[sig_diff == np.inf] = 0 - sig_diff = np.nanmean(sig_diff, 0) - # 5% tolerance - if np.max(sig_diff) <= 5: - test_results.append(True) - else: - test_results.append(False) - elif field == "init_value": - signal_diff = [ - abs(record_MIT[field][i] - record_EDF[field][i]) - for i in range(len(record_MIT[field])) - ] - if abs(max(min(signal_diff), max(signal_diff), key=abs)) <= 2: - test_results.append(True) - else: - test_results.append(False) - else: - test_results.append(record_MIT[field] == record_MIT[field]) - - target_results = len(fields) * [True] - assert np.array_equal(test_results, target_results) - # --------------------- 3. 
Multi-dat records --------------------- # def test_3a(self): diff --git a/wfdb/__init__.py b/wfdb/__init__.py index c856cb65..aaa53499 100644 --- a/wfdb/__init__.py +++ b/wfdb/__init__.py @@ -6,17 +6,10 @@ rdsamp, wrsamp, dl_database, - edf2mit, - mit2edf, - wav2mit, - mit2wav, - wfdb2mat, - csv2mit, sampfreq, signame, wfdbdesc, wfdbtime, - sigavg, ) from wfdb.io.annotation import ( Annotation, diff --git a/wfdb/io/__init__.py b/wfdb/io/__init__.py index 0aca3cd5..d49665b5 100644 --- a/wfdb/io/__init__.py +++ b/wfdb/io/__init__.py @@ -6,17 +6,10 @@ rdsamp, wrsamp, dl_database, - edf2mit, - mit2edf, - wav2mit, - mit2wav, - wfdb2mat, - csv2mit, sampfreq, signame, wfdbdesc, wfdbtime, - sigavg, SIGNAL_CLASSES, ) from wfdb.io._signal import est_res, wr_dat_file @@ -38,4 +31,8 @@ dl_files, set_db_index_url, ) -from wfdb.io.tff import rdtff +from wfdb.io.convert.csv import csv_to_wfdb +from wfdb.io.convert.edf import read_edf, wfdb_to_edf +from wfdb.io.convert.matlab import wfdb_to_mat +from wfdb.io.convert.tff import rdtff +from wfdb.io.convert.wav import wfdb_to_wav, read_wav diff --git a/wfdb/io/annotation.py b/wfdb/io/annotation.py index c22a0a85..5be014c0 100644 --- a/wfdb/io/annotation.py +++ b/wfdb/io/annotation.py @@ -4,14 +4,13 @@ import pandas as pd import re import posixpath -import pdb import struct import sys from wfdb.io import download from wfdb.io import _header from wfdb.io import record - +from wfdb.io.convert.edf import read_edf class Annotation(object): """ @@ -3026,7 +3025,7 @@ def rdedfann( # "The coding is EDF compatible in the sense that old EDF software would # simply treat this 'EDF Annotations' signal as if it were a (strange- # looking) ordinary signal" - rec = record.edf2mit( + rec = read_edf( record_name, pn_dir=pn_dir, delete_file=delete_file, diff --git a/wfdb/io/convert/__init__.py b/wfdb/io/convert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wfdb/io/convert/csv.py b/wfdb/io/convert/csv.py new file mode 100644 index 00000000..21ae29c7 --- /dev/null +++ b/wfdb/io/convert/csv.py @@ -0,0 +1,480 @@ +import datetime +import os + +import numpy as np +import pandas as pd + +from wfdb.io import _header +from wfdb.io.record import Record, wrsamp + + +def csv_to_wfdb( + file_name, + fs, + units, + fmt=None, + adc_gain=None, + baseline=None, + samps_per_frame=None, + counter_freq=None, + base_counter=None, + base_time=None, + base_date=None, + comments=None, + sig_name=None, + dat_file_name=None, + skew=None, + byte_offset=None, + adc_res=None, + adc_zero=None, + init_value=None, + checksum=None, + block_size=None, + record_only=False, + header=True, + delimiter=",", + verbose=False, +): + """ + Read a CSV file and return either a `Record` object with the record + descriptors as attributes, or write the record and header files to disk. + + Parameters + ---------- + file_name : str + The name of the CSV file to be read, including the '.csv' file + extension. If the argument contains any path delimiter characters, + the argument will be interpreted as PATH/FILE_NAME. Both relative + and absolute paths are accepted. + fs : float + This number can be expressed in any format legal for a Python input of + floating point numbers (thus '360', '360.', '360.0', and '3.6e2' are + all legal and equivalent). 
The sampling frequency must be greater than 0; + if it is missing, a value of 250 is assumed. + units : list, str + This will be applied as the passed list unless a single str is passed + instead - in which case the str will be assigned for all channels. + This field can be present only if the ADC gain is also present. It + follows the baseline field if that field is present, or the gain field + if the baseline field is absent. The units field is a list of character + strings that specifies the type of physical unit. If the units field is + absent, the physical unit may be assumed to be 1 mV. + fmt : list, str, optional + This will be applied as the passed list unless a single str is passed + instead - in which case the str will be assigned for all + channels. A list of strings giving the WFDB format of each file used to + store each channel. Accepted formats are: '80','212','16','24', and + '32'. There are other WFDB formats as specified by: + https://www.physionet.org/physiotools/wag/signal-5.htm + but this library will not write (though it will read) those file types. + Each field is an integer that specifies the storage format of the signal. + All signals in a given group are stored in the same format. The most + common format is format `16` (sixteen-bit amplitudes). The parameters + `samps_per_frame`, `skew`, and `byte_offset` are optional fields, and + if present, are bound to the format field. In other words, they may be + considered as format modifiers, since they further describe the encoding + of samples within the signal file. + adc_gain : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field is a list of numbers that specifies the difference in + sample values that would be observed if a step of one physical unit + occurred in the original analog signal. For ECGs, the gain is usually + roughly equal to the R-wave amplitude in a lead that is roughly parallel + to the mean cardiac electrical axis. If the gain is zero or missing, this + indicates that the signal amplitude is uncalibrated; in such cases, a + value of 200 ADC units per physical unit may be assumed. + baseline : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. This + field can be present only if the ADC gain is also present. It is not + separated by whitespace from the ADC gain field; rather, it is + surrounded by parentheses, which delimit it. The baseline is an integer + that specifies the sample value corresponding to 0 physical units. If + absent, the baseline is taken to be equal to the ADC zero. Note that + the baseline need not be a value within the ADC range; for example, + if the ADC input range corresponds to 200-300 degrees Kelvin, the + baseline is the (extended precision) value that would map to 0 degrees + Kelvin. + samps_per_frame : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + Normally, all signals in a given record are sampled at the (base) + sampling frequency as specified by `fs`; in this case, the number of + samples per frame is 1 for all signals, and this field is conventionally + omitted. 
If the signal was sampled at some integer multiple, n, of the + base sampling frequency, however, each frame contains n samples of the + signal, and the value specified in this field is also n. (Note that + non-integer multiples of the base sampling frequency are not supported). + counter_freq : float, optional + This field (a floating-point number, in the same format as `fs`) can be + present only if `fs` is also present. Typically, the counter frequency + may be derived from an analog tape counter, or from page numbers in a + chart recording. If the counter frequency is absent or not positive, + it is assumed to be equal to `fs`. + base_counter : float, optional + This field can be present only if the counter frequency is also present. + The base counter value is a floating-point number that specifies the counter + value corresponding to sample 0. If absent, the base counter value is + taken to be 0. + base_time : str, optional + This field can be present only if the number of samples is also present. + It gives the time of day that corresponds to the beginning of the + record, in 'HH:MM:SS' format (using a 24-hour clock; thus '13:05:00', or + '13:5:0', represent 1:05 pm). If this field is absent, the time-conversion + functions assume a value of '0:0:0', corresponding to midnight. + base_date : str, optional + This field can be present only if the base time is also present. It contains + the date that corresponds to the beginning of the record, in 'DD/MM/YYYY' + format (e.g., '25/4/1989' is '25 April 1989'). + comments : list, optional + A list of string comments to be written to the header file. Each string + entry represents a new line to be appended to the bottom of the header + file ('.hea'). + sig_name : list, optional + A list of strings giving the signal name of each signal channel. This + will be used for plotting the signal both in this package and + LightWave. Note, this value will be used in preference to the CSV + header, if applicable, to define custom signal names. + dat_file_name : str, optional + The name of the file in which samples of the signal are kept. Although the + record name is usually part of the signal file name, this convention is + not a requirement. Note that several signals can share the same file + (i.e., they can belong to the same signal group); all entries for signals + that share a given file must be consecutive, however. Note, the default + behavior is to save the files in the current working directory, not the + directory of the file being read. + skew : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + Ideally, within a given record, samples of different signals with the + same sample number are simultaneous (within one sampling interval). + If this is not the case (as, for example, when a multitrack analog + tape recording is digitized and the azimuth of the playback head does + not match that of the recording head), the skew between signals can + sometimes be determined (for example, by locating recorded waveform + features with known time relationships, such as calibration signals). + If this has been done, the skew field may be inserted into the header + file to indicate the (positive) number of samples of the signal that + are considered to precede sample 0. These samples, if any, are included + in the checksum. (Note the checksum need not be changed if the skew field + is inserted or modified). 
+ byte_offset : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + Normally, signal files include only sample data. If a signal file + includes a preamble, however, this field specifies the offset in bytes + from the beginning of the signal file to sample 0 (i.e., the length + of the preamble). Data within the preamble is not included in the signal + checksum. Note that the byte offset must be the same for all signals + within a given group (use the skew field to correct for intersignal + skew). This feature is provided only to simplify the task of reading + signal files not generated using the WFDB library; the WFDB library + does not support any means of writing such files, and byte offsets must + be inserted into header files manually. + adc_res: list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field can be present only if the ADC gain is also present. It + specifies the resolution of the analog-to-digital converter used to + digitize the signal. Typical ADCs have resolutions between 8 and 16 + bits. If this field is missing or zero, the default value is 12 bits + for amplitude-format signals, or 10 bits for difference-format signals + (unless a lower value is specified by the format field). + adc_zero: list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field can be present only if the ADC resolution is also present. + It is an integer that represents the amplitude (sample value) that + would be observed if the analog signal present at the ADC inputs had + a level that fell exactly in the middle of the input range of the ADC. + For a bipolar ADC, this value is usually zero, but a unipolar (offset + binary) ADC usually produces a non-zero value in the middle of its + range. Together with the ADC resolution, the contents of this field + can be used to determine the range of possible sample values. If this + field is missing, a value of 0 is assumed. + init_value : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field can be present only if the ADC zero is also present. It + specifies the value of sample 0 in the signal, but is used only if the + signal is stored in difference format. If this field is missing, a + value equal to the ADC zero is assumed. + checksum : list, optional + This field can be present only if the initial value is also present. It + is a 16-bit signed checksum of all samples in the signal. (Thus the + checksum is independent of the storage format.) If the entire record + is read without skipping samples, and the header’s record line specifies + the correct number of samples per signal, this field is compared against + a computed checksum to verify that the signal file has not been corrupted. + A value of zero may be used as a field placeholder if the number of + samples is unspecified. + block_size : list, int, optional + This will be applied as the passed list unless a single int is passed + instead - in which case the int will be assigned for all channels. + This field can be present only if the checksum is present. This field + is an integer and is usually 0. 
If the signal is stored in a file + that must be read in blocks of a specific size, however, this field + specifies the block size in bytes. (On UNIX systems, this is the case + only for character special files, corresponding to certain tape and + raw disk files. If necessary, the block size may be given as a negative + number to indicate that the associated file lacks I/O driver support for + some operations.) All signals belonging to the same signal group have + the same block size. + record_only : bool, optional + Whether to only return the record information (True) or not (False). + If false, this function will generate both a .dat and .hea file. + header : bool, optional + Whether to assume the CSV has a first line header (True) or not (False) + which defines the signal names. If false, this function will use + either the signal names provided by `sig_name` or set `[ch_1, ch_2, ...]` + as the default. + delimiter : str, optional + What to use as the delimiter for the file to separate data. The default + is a comma (','). Other common delimiters are tabs ('\t'), spaces (' '), + pipes ('|'), and colons (':'). + verbose : bool, optional + Whether to print all the information read about the file (True) or + not (False). + + Returns + ------- + record : Record or MultiRecord, optional + The WFDB Record or MultiRecord object representing the contents + of the CSV file read. + + Notes + ----- + CSVs should be in the following format: + + sig_1_name,sig_2_name,... + sig_1_val_1,sig_2_val_1,... + sig_1_val_2,sig_2_val_2,... + ...,...,... + + Or this format if `header=False` is defined: + + sig_1_val_1,sig_2_val_1,... + sig_1_val_2,sig_2_val_2,... + ...,...,... + + The signal will be saved as `p_signal` by default, so both floats and + ints are acceptable. 
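+ + For instance, assuming a hypothetical two-channel file 'ecg.csv' whose + header line reads 'I,II' and whose rows hold floating-point samples + recorded at 250 Hz, calling `csv_to_wfdb('ecg.csv', fs=250, units='mV', + record_only=True)` would return a `Record` with `n_sig=2` and + `sig_name=['I', 'II']`. 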
+ + Examples + -------- + Create the header ('.hea') and record ('.dat') files, specifying both + units as 'mV' + >>> csv_to_wfdb('sample-data/100.csv', fs=360, units='mV') + + Create the header ('.hea') and record ('.dat') files, changing the units + for each signal + >>> csv_to_wfdb('sample-data/100.csv', fs=360, units=['mV','kV']) + + Return just the record, note the use of lists to specify which values should + be applied to each signal + >>> csv_record = csv_to_wfdb('sample-data/100.csv', fs=360, units=['mV','mV'], + fmt=['80','212'], adc_gain=[100,200], + baseline=[1024,512], record_only=True) + + Return just the record, note the use of single strings and ints to specify + when fields can be applied to all signals + >>> csv_record = csv_to_wfdb('sample-data/100.csv', fs=360, units='mV', + fmt=['80','212'], adc_gain=200, baseline=1024, + record_only=True) + + """ + # NOTE: No need to write input checks here since the Record class should + # handle them (except verifying the CSV input format which is for Pandas) + if header: + df_CSV = pd.read_csv(file_name, delimiter=delimiter) + else: + df_CSV = pd.read_csv(file_name, delimiter=delimiter, header=None) + if verbose: + print("Successfully read CSV") + # Extract the entire signal from the dataframe + p_signal = df_CSV.values + # The dataframe should be in (`sig_len`, `n_sig`) dimensions + sig_len = p_signal.shape[0] + if verbose: + print("Signal length: {}".format(sig_len)) + n_sig = p_signal.shape[1] + if verbose: + print("Number of signals: {}".format(n_sig)) + # Check if signal names are valid and set defaults + if not sig_name: + if header: + sig_name = df_CSV.columns.to_list() + if any(map(str.isdigit, sig_name)): + print( + "WARNING: One or more of your signal names are numbers, this " + "is not recommended:\n- Does your CSV have a header line " + "which defines the signal names?\n- If not, please set the " + "parameter 'header' to False.\nSignal names: {}".format( + sig_name + ) + ) + else: + sig_name = ["ch_" + str(i) for i in range(n_sig)] + if verbose: + print("Signal names: {}".format(sig_name)) + + # Set the output header file name to be the same, remove path + if os.sep in file_name: + file_name = file_name.split(os.sep)[-1] + record_name = file_name.replace(".csv", "") + if verbose: + print("Output header: {}.hea".format(record_name)) + + # Replace the CSV file tag with DAT + dat_file_name = file_name.replace(".csv", ".dat") + dat_file_name = [dat_file_name] * n_sig + if verbose: + print("Output record: {}".format(dat_file_name[0])) + + # Convert `units` from string to list if necessary + units = [units] * n_sig if type(units) is str else units + + # Set the default `fmt` if none exists + if not fmt: + fmt = ["16"] * n_sig + fmt = [fmt] * n_sig if type(fmt) is str else fmt + if verbose: + print("Signal format: {}".format(fmt)) + + # Set the default `adc_gain` if none exists + if not adc_gain: + adc_gain = [200] * n_sig + adc_gain = [adc_gain] * n_sig if type(adc_gain) is int else adc_gain + if verbose: + print("Signal ADC gain: {}".format(adc_gain)) + + # Set the default `baseline` if none exists + if not baseline: + if adc_zero: + baseline = [adc_zero] * n_sig + else: + baseline = [0] * n_sig + baseline = [baseline] * n_sig if type(baseline) is int else baseline + if verbose: + print("Signal baseline: {}".format(baseline)) + + # Convert `samps_per_frame` from int to list if necessary + samps_per_frame = ( + [samps_per_frame] * n_sig + if type(samps_per_frame) is int + else samps_per_frame + ) + + # Convert `skew` from int to list if necessary + skew = [skew] * n_sig if type(skew) is int else skew + + # Convert `byte_offset` from int to list if necessary + byte_offset = ( + [byte_offset] * n_sig if type(byte_offset) is int else byte_offset + ) + + # Set the default `adc_res` if none exists + if not adc_res: + adc_res = [12] * n_sig + adc_res = [adc_res] * n_sig if type(adc_res) is int else adc_res + if verbose: + print("Signal ADC resolution: {}".format(adc_res)) + + # Set the default `adc_zero` if none exists + if not adc_zero: + adc_zero = [0] * n_sig + adc_zero = [adc_zero] * n_sig if type(adc_zero) is int else adc_zero + if verbose: + print("Signal ADC zero: {}".format(adc_zero)) + + # Set the default `init_value` + # NOTE: Initial value (and subsequently the digital signal) won't be correct + # unless the correct `baseline` and `adc_gain` are provided... this is just + # the best approximation + if not init_value: + init_value = p_signal[0, :] + init_value = baseline + (np.array(adc_gain) * init_value) + init_value = [int(i) for i in init_value.tolist()] + if verbose: + print("Signal initial value: {}".format(init_value)) + + # Set the default `checksum` + if not checksum: + checksum = [int(np.sum(v) % 65536) for v in np.transpose(p_signal)] + if verbose: + print("Signal checksum: {}".format(checksum)) + + # Set the default `block_size` + if not block_size: + block_size = [0] * n_sig + block_size = [block_size] * n_sig if type(block_size) is int else block_size + if verbose: + print("Signal block size: {}".format(block_size)) + + # Change the dates and times into `datetime` objects + if base_time: + base_time = _header.wfdb_strptime(base_time) + if base_date: + base_date = datetime.datetime.strptime(base_date, "%d/%m/%Y").date() + + # Convert array to floating point + p_signal = p_signal.astype("float64") + + # Either return the record or generate the record and header files + # if requested + if record_only: + # Create the record from the input and generated values + record = Record( + record_name=record_name, + n_sig=n_sig, + fs=fs, + samps_per_frame=samps_per_frame, + counter_freq=counter_freq, + base_counter=base_counter, + sig_len=sig_len, + base_time=base_time, + base_date=base_date, + comments=comments, + sig_name=sig_name, + p_signal=p_signal, + d_signal=None, + e_p_signal=None, + e_d_signal=None, + file_name=dat_file_name, + fmt=fmt, + skew=skew, + byte_offset=byte_offset, + adc_gain=adc_gain, + baseline=baseline, + units=units, + adc_res=adc_res, + adc_zero=adc_zero, + init_value=init_value, + checksum=checksum, + block_size=block_size, + ) + if verbose: + print("Record generated successfully") + return record + + else: + # Write the information to a record and header file + wrsamp( + record_name=record_name, + fs=fs, + units=units, + sig_name=sig_name, + p_signal=p_signal, + fmt=fmt, + adc_gain=adc_gain, + baseline=baseline, + comments=comments, + base_time=base_time, + base_date=base_date, + ) + if verbose: + print("File generated successfully") diff --git a/wfdb/io/convert/edf.py b/wfdb/io/convert/edf.py new file mode 100644 index 00000000..17ae8b68 --- /dev/null +++ b/wfdb/io/convert/edf.py @@ -0,0 +1,988 @@ +import datetime +import functools +import math +import os +import posixpath +import struct + +import numpy as np + +from wfdb.io.record import Record, rdrecord, SIG_UNITS, get_version +from wfdb.io import _url +from wfdb.io import download + + +def read_edf( + record_name, + pn_dir=None, + header_only=False, + verbose=False, + rdedfann_flag=False, +): + """ + Read an EDF format file into 
a WFDB Record. + + Many EDF files contain signals at widely varying sampling frequencies. + `read_edf` handles these properly, but the default behavior of most WFDB + applications is to read such data in low-resolution mode (in which all + signals are resampled at the lowest sampling frequency used for any signal + in the record). This is almost certainly not what you want if, for + example, the record contains EEG signals sampled at 200 Hz and body + temperature sampled at 1 Hz; by default, applications such as `rdsamp` + will resample the EEGs (and any other signals in the record) at 1 Hz. To + avoid this behavior, use the `smooth_frames=False` (high-resolution) option + provided by `rdrecord` and a few other WFDB functions. + + Note that applications built using version 3.1.0 and later versions of + the WFDB-Python library can read EDF files directly, so that the conversion + performed by `read_edf` is no longer necessary. However, one can still use + this function to produce WFDB-compatible files from EDF files if desired. + + Parameters + ---------- + record_name : str + The name of the input EDF record to be read. + pn_dir : str, optional + Option used to stream data from Physionet. The Physionet + database directory from which to find the required record files. + e.g. For record '100' in 'http://physionet.org/content/mitdb' + pn_dir='mitdb'. + header_only : bool, optional + Whether to only return the header information (True) or not (False). + If true, this function will only return `['fs', 'sig_len', 'n_sig', + 'base_date', 'base_time', 'units', 'sig_name', 'comments']`. + verbose : bool, optional + Whether to print all the information read about the file (True) or + not (False). + rdedfann_flag : bool, optional + Whether the function is being called by `rdedfann` or the user. If it + is being called by the user and the file has annotations, then warn + them that the EDF file has annotations and that they should use + `rdedfann` instead. + + Returns + ------- + record : Record or dict + A WFDB Record object containing the data read from the EDF file, or, + if `header_only` is True, a dict holding only the header fields listed + above. The physical signal may differ very slightly from native WFDB + output due to floating-point conversion. + + Notes + ----- + The entire file is composed of (seen here: https://www.edfplus.info/specs/edf.html): + + HEADER RECORD (we suggest to also adopt the 12 simple additional EDF+ specs) + 8 ascii : version of this data format (0) + 80 ascii : local patient identification (mind item 3 of the additional EDF+ specs) + 80 ascii : local recording identification (mind item 4 of the additional EDF+ specs) + 8 ascii : startdate of recording (dd.mm.yy) (mind item 2 of the additional EDF+ specs) + 8 ascii : starttime of recording (hh.mm.ss) + 8 ascii : number of bytes in header record + 44 ascii : reserved + 8 ascii : number of data records (-1 if unknown, obey item 10 of the additional EDF+ specs) + 8 ascii : duration of a data record, in seconds + 4 ascii : number of signals (ns) in data record + ns * 16 ascii : ns * label (e.g. EEG Fpz-Cz or Body temp) (mind item 9 of the additional EDF+ specs) + ns * 80 ascii : ns * transducer type (e.g. AgAgCl electrode) + ns * 8 ascii : ns * physical dimension (e.g. uV or degreeC) + ns * 8 ascii : ns * physical minimum (e.g. -500 or 34) + ns * 8 ascii : ns * physical maximum (e.g. 500 or 40) + ns * 8 ascii : ns * digital minimum (e.g. -2048) + ns * 8 ascii : ns * digital maximum (e.g. 
2047) + ns * 80 ascii : ns * prefiltering (e.g. HP:0.1Hz LP:75Hz) + ns * 8 ascii : ns * nr of samples in each data record + ns * 32 ascii : ns * reserved + + DATA RECORD + nr of samples[1] * integer : first signal in the data record + nr of samples[2] * integer : second signal + .. + .. + nr of samples[ns] * integer : last signal + + Bytes 0 - 127: descriptive text + Bytes 128 - 131: master tag (data type = matrix) + Bytes 132 - 135: master tag (data size) + Bytes 136 - 151: array flags (4 byte tag with data type, 4 byte + tag with subelement size, 8 bytes of content) + Bytes 152 - 167: array dimension (4 byte tag with data type, 4 + byte tag with subelement size, 8 bytes of content) + Bytes 168 - 183: array name (4 byte tag with data type, 4 byte + tag with subelement size, 8 bytes of content) + Bytes 184 - ...: array content (4 byte tag with data type, 4 byte + tag with subelement size, ... bytes of content) + + Examples + -------- + >>> record = read_edf('x001_FAROS.edf', + pn_dir='simultaneous-measurements/raw_data') + + """ + if pn_dir is not None: + + if "." not in pn_dir: + dir_list = pn_dir.split("/") + pn_dir = posixpath.join( + dir_list[0], get_version(dir_list[0]), *dir_list[1:] + ) + + file_url = posixpath.join(download.PN_INDEX_URL, pn_dir, record_name) + # Currently must download file for MNE to read it though can give the + # user the option to delete it immediately afterwards + with _url.openurl(file_url, "rb") as f: + open(record_name, "wb").write(f.read()) + + # Open the desired file + edf_file = open(record_name, mode="rb") + + # Version of this data format (8 bytes) + version = struct.unpack("<8s", edf_file.read(8))[0].decode() + + # Check to see that the input is an EDF file. (This check will detect + # most but not all other types of files.) + if version != "0 ": + raise Exception( + "Input does not appear to be EDF -- no conversion attempted" + ) + else: + if verbose: + print("EDF version number: {}".format(version.strip())) + + # Local patient identification (80 bytes) + patient_id = struct.unpack("<80s", edf_file.read(80))[0].decode() + if verbose: + print("Patient ID: {}".format(patient_id)) + + # Local recording identification (80 bytes) + # Bob Kemp recommends using this field to encode the start date + # including an abbreviated month name in English and a full (4-digit) + # year, as is done here if this information is available in the input + # record. EDF+ requires this. 
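+ # Note: each header field below is a fixed-width ASCII field; it is read + # with edf_file.read(n), unpacked as raw bytes via struct.unpack, then + # decoded (and stripped where appropriate). 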
+ record_id = struct.unpack("<80s", edf_file.read(80))[0].decode() + if verbose: + print("Recording ID: {}".format(record_id)) + + # Start date of recording (dd.mm.yy) (8 bytes) + start_date = struct.unpack("<8s", edf_file.read(8))[0].decode() + if verbose: + print("Recording Date: {}".format(start_date)) + start_day, start_month, start_year = [int(i) for i in start_date.split(".")] + # This should work for a while + if start_year < 1970: + start_year += 1900 + if start_year < 1970: + start_year += 100 + + # Start time of recording (hh.mm.ss) (8 bytes) + start_time = struct.unpack("<8s", edf_file.read(8))[0].decode() + if verbose: + print("Recording Time: {}".format(start_time)) + start_hour, start_minute, start_second = [ + int(i) for i in start_time.split(".") + ] + + # Number of bytes in header (8 bytes) + header_bytes = int(struct.unpack("<8s", edf_file.read(8))[0].decode()) + if verbose: + print("Number of bytes in header record: {}".format(header_bytes)) + + # Reserved (44 bytes) + reserved_notes = ( + struct.unpack("<44s", edf_file.read(44))[0].decode().strip() + ) + if reserved_notes[:5] == "EDF+C": + # The file is EDF compatible and will work without issue + # See: Bob Kemp, Jesus Olivan, European data format ‘plus’ (EDF+), an + # EDF alike standard format for the exchange of physiological + # data, Clinical Neurophysiology, Volume 114, Issue 9, 2003, + # Pages 1755-1761, ISSN 1388-2457 + pass + elif reserved_notes[:5] == "EDF+D": + raise Exception( + "EDF+ File: interrupted data records (not currently supported)" + ) + else: + if verbose: + print("Free Space: {}".format(reserved_notes)) + + # Number of blocks (-1 if unknown) (8 bytes) + num_blocks = int(struct.unpack("<8s", edf_file.read(8))[0].decode()) + if verbose: + print("Number of data records: {}".format(num_blocks)) + if num_blocks == -1: + raise Exception( + "Number of data records is unknown (not currently supported)" + ) + + # Duration of a block, in seconds (8 bytes) + block_duration = float(struct.unpack("<8s", edf_file.read(8))[0].decode()) + if verbose: + print( + "Duration of each data record in seconds: {}".format(block_duration) + ) + if block_duration <= 0.0: + block_duration = 1.0 + + # Number of signals (4 bytes) + n_sig = int(struct.unpack("<4s", edf_file.read(4))[0].decode()) + if verbose: + print("Number of signals: {}".format(n_sig)) + if n_sig < 1: + raise Exception("Done: no signals left to read") + + # Label (e.g., EEG FpzCz or Body temp) (16 bytes each) + sig_name = [] + for _ in range(n_sig): + temp_sig = struct.unpack("<16s", edf_file.read(16))[0].decode().strip() + if temp_sig == "EDF Annotations" and not rdedfann_flag: + print( + "*** This may be an EDF+ Annotation file instead, please see " + "the `rdedfann` function. 
***" + ) + sig_name.append(temp_sig) + if verbose: + print("Signal Labels: {}".format(sig_name)) + + # Transducer type (e.g., AgAgCl electrode) (80 bytes each) + transducer_types = [] + for _ in range(n_sig): + transducer_types.append( + struct.unpack("<80s", edf_file.read(80))[0].decode().strip() + ) + if verbose: + print("Transducer Types: {}".format(transducer_types)) + + # Physical dimension (e.g., uV or degreeC) (8 bytes each) + physical_dims = [] + for _ in range(n_sig): + physical_dims.append( + struct.unpack("<8s", edf_file.read(8))[0].decode().strip() + ) + if verbose: + print("Physical Dimensions: {}".format(physical_dims)) + + # Physical minimum (e.g., -500 or 34) (8 bytes each) + physical_min = np.array([]) + for _ in range(n_sig): + physical_min = np.append( + physical_min, + float(struct.unpack("<8s", edf_file.read(8))[0].decode()), + ) + if verbose: + print("Physical Minimums: {}".format(physical_min)) + + # Physical maximum (e.g., 500 or 40) (8 bytes each) + physical_max = np.array([]) + for _ in range(n_sig): + physical_max = np.append( + physical_max, + float(struct.unpack("<8s", edf_file.read(8))[0].decode()), + ) + if verbose: + print("Physical Maximums: {}".format(physical_max)) + + # Digital minimum (e.g., -2048) (8 bytes each) + digital_min = np.array([]) + for _ in range(n_sig): + digital_min = np.append( + digital_min, + float(struct.unpack("<8s", edf_file.read(8))[0].decode()), + ) + if verbose: + print("Digital Minimums: {}".format(digital_min)) + + # Digital maximum (e.g., 2047) (8 bytes each) + digital_max = np.array([]) + for _ in range(n_sig): + digital_max = np.append( + digital_max, + float(struct.unpack("<8s", edf_file.read(8))[0].decode()), + ) + if verbose: + print("Digital Maximums: {}".format(digital_max)) + + # Prefiltering (e.g., HP:0.1Hz LP:75Hz) (80 bytes each) + prefilter_info = [] + for _ in range(n_sig): + prefilter_info.append( + struct.unpack("<80s", edf_file.read(80))[0].decode().strip() + ) + if verbose: + print("Prefiltering Information: {}".format(prefilter_info)) + + # Number of samples per block (8 bytes each) + samps_per_block = [] + for _ in range(n_sig): + samps_per_block.append( + int(struct.unpack("<8s", edf_file.read(8))[0].decode()) + ) + if verbose: + print("Number of Samples per Record: {}".format(samps_per_block)) + + # The last 32*nsig bytes in the header are unused + for _ in range(n_sig): + struct.unpack("<32s", edf_file.read(32))[0].decode() + + # Pre-process the acquired data before creating the record + record_name_out = ( + record_name.split(os.sep)[-1].replace("-", "_").replace(".edf", "") + ) + sample_rate = [int(i / block_duration) for i in samps_per_block] + fs = functools.reduce(math.gcd, sample_rate) + samps_per_frame = [int(s / min(samps_per_block)) for s in samps_per_block] + sig_len = int(fs * num_blocks * block_duration) + base_time = datetime.time(start_hour, start_minute, start_second) + base_date = datetime.date(start_year, start_month, start_day) + file_name = n_sig * [record_name_out + ".dat"] + fmt = n_sig * ["16"] + skew = n_sig * [None] + byte_offset = n_sig * [None] + adc_gain_all = (digital_max - digital_min) / (physical_max - physical_min) + adc_gain = [float(format(a, ".12g")) for a in adc_gain_all] + baseline = (digital_max - (physical_max * adc_gain_all) + 1).astype("int64") + + units = n_sig * [""] + for i, f in enumerate(physical_dims): + if f == "n/a": + label = sig_name[i].lower().split()[0] + if label in list(SIG_UNITS.keys()): + units[i] = SIG_UNITS[label] + else: + units[i] = "n/a" + else: 
+ f = f.replace("µ", "u") # Maybe more weird symbols to check for? + if f == "": + units[i] = "mV" + else: + units[i] = f + + adc_res = [int(math.log2(f)) for f in (digital_max - digital_min)] + adc_zero = [int(f) for f in ((digital_max + 1 + digital_min) / 2)] + block_size = n_sig * [0] + base_datetime = datetime.datetime( + start_year, + start_month, + start_day, + start_hour, + start_minute, + start_second, + ) + base_time = datetime.time( + base_datetime.hour, base_datetime.minute, base_datetime.second + ) + base_date = datetime.date( + base_datetime.year, base_datetime.month, base_datetime.day + ) + + if header_only: + return { + "fs": fs, + "sig_len": sig_len, + "n_sig": n_sig, + "base_date": base_date, + "base_time": base_time, + "units": units, + "sig_name": sig_name, + "comments": [], + } + + sig_data = np.empty((sig_len, n_sig)) + temp_sig_data = np.fromfile(edf_file, dtype=np.int16) + temp_sig_data = temp_sig_data.reshape((-1, sum(samps_per_block))) + temp_all_sigs = np.hsplit(temp_sig_data, np.cumsum(samps_per_block)[:-1]) + for i in range(n_sig): + # Check if `samps_per_frame` has all equal values + if samps_per_frame.count(samps_per_frame[0]) == len(samps_per_frame): + sig_data[:, i] = ( + temp_all_sigs[i].flatten() - baseline[i] + ) / adc_gain_all[i] + else: + temp_sig_data = temp_all_sigs[i].flatten() + if samps_per_frame[i] == 1: + sig_data[:, i] = (temp_sig_data - baseline[i]) / adc_gain_all[i] + else: + for j in range(sig_len): + start_ind = j * samps_per_frame[i] + stop_ind = start_ind + samps_per_frame[i] + sig_data[j, i] = np.mean( + (temp_sig_data[start_ind:stop_ind] - baseline[i]) + / adc_gain_all[i] + ) + + # This is the closest I can get to the original implementation + # NOTE: This is done using `np.testing.assert_array_equal()` + # Mismatched elements: 15085545 / 15400000 (98%) + # Max absolute difference: 3.75166564e-12 + # Max relative difference: 5.41846079e-15 + # x: array([[ -3.580728, 42.835293, -102.818048, 54.978632, -52.354247], + # [ -8.340205, 43.079939, -102.106351, 56.402027, -44.992626], + # [ -5.004123, 43.546991, -99.481966, 51.64255 , -43.079939],... + # y: array([[ -3.580728, 42.835293, -102.818048, 54.978632, -52.354247], + # [ -8.340205, 43.079939, -102.106351, 56.402027, -44.992626], + # [ -5.004123, 43.546991, -99.481966, 51.64255 , -43.079939],... + + init_value = [int(s[0, 0]) for s in temp_all_sigs] + checksum = [ + int(np.sum(v) % 65536) for v in np.transpose(sig_data) + ] # not all values correct? + + record = Record( + record_name=record_name_out, + n_sig=n_sig, + fs=fs, + samps_per_frame=samps_per_frame, + counter_freq=None, + base_counter=None, + sig_len=sig_len, + base_time=base_time, + base_date=base_date, + comments=[], + sig_name=sig_name, # Remove whitespace to make compatible later? 
+ p_signal=sig_data, + d_signal=None, + e_p_signal=None, + e_d_signal=None, + file_name=n_sig * [record_name_out + ".dat"], + fmt=n_sig * ["16"], + skew=n_sig * [None], + byte_offset=n_sig * [None], + adc_gain=adc_gain, + baseline=baseline, + units=units, + adc_res=[int(math.log2(f)) for f in (digital_max - digital_min)], + adc_zero=[int(f) for f in ((digital_max + 1 + digital_min) / 2)], + init_value=init_value, + checksum=checksum, + block_size=n_sig * [0], + ) + + record.base_datetime = base_datetime + + return record + + +def wfdb_to_edf( + record_name, + pn_dir=None, + sampfrom=0, + sampto=None, + channels=None, + output_filename="", + edf_plus=False, +): + """ + Convert a WFDB record (as used in PhysioNet) into an EDF (European Data + Format) file. European Data Format (EDF) was originally designed for + storage of polysomnograms. + + Note that WFDB format does not include a standard way to specify the + transducer type or the prefiltering specification; these parameters are + not preserved by this conversion. Also note that use of the + standard signal and unit names specified for EDF is permitted but not + enforced by `wfdb_to_edf`. + + Parameters + ---------- + record_name : str + The name of the input WFDB record to be read. Can also work with both + EDF and WAV files. + pn_dir : str, optional + Option used to stream data from Physionet. The Physionet + database directory from which to find the required record files. + e.g. For record '100' in 'http://physionet.org/content/mitdb' + pn_dir='mitdb'. + sampfrom : int, optional + The starting sample number to read for all channels. + sampto : int, 'end', optional + The sample number at which to stop reading for all channels. + Reads the entire duration by default. + channels : list, optional + List of integer indices specifying the channels to be read. + Reads all channels by default. + output_filename : str, optional + The desired name of the output file. If this value is set to the + default value of '', then the output filename will be 'REC.edf'. + edf_plus : bool, optional + Whether to write the output file in EDF (False) or EDF+ (True) format. + + Returns + ------- + N/A + + Notes + ----- + The entire file is composed of (seen here: https://www.edfplus.info/specs/edf.html): + + HEADER RECORD (we suggest to also adopt the 12 simple additional EDF+ specs) + 8 ascii : version of this data format (0) + 80 ascii : local patient identification (mind item 3 of the additional EDF+ specs) + 80 ascii : local recording identification (mind item 4 of the additional EDF+ specs) + 8 ascii : startdate of recording (dd.mm.yy) (mind item 2 of the additional EDF+ specs) + 8 ascii : starttime of recording (hh.mm.ss) + 8 ascii : number of bytes in header record + 44 ascii : reserved + 8 ascii : number of data records (-1 if unknown, obey item 10 of the additional EDF+ specs) + 8 ascii : duration of a data record, in seconds + 4 ascii : number of signals (ns) in data record + ns * 16 ascii : ns * label (e.g. EEG Fpz-Cz or Body temp) (mind item 9 of the additional EDF+ specs) + ns * 80 ascii : ns * transducer type (e.g. AgAgCl electrode) + ns * 8 ascii : ns * physical dimension (e.g. uV or degreeC) + ns * 8 ascii : ns * physical minimum (e.g. -500 or 34) + ns * 8 ascii : ns * physical maximum (e.g. 500 or 40) + ns * 8 ascii : ns * digital minimum (e.g. -2048) + ns * 8 ascii : ns * digital maximum (e.g. 2047) + ns * 80 ascii : ns * prefiltering (e.g. 
HP:0.1Hz LP:75Hz) + ns * 8 ascii : ns * nr of samples in each data record + ns * 32 ascii : ns * reserved + + DATA RECORD + nr of samples[1] * integer : first signal in the data record + nr of samples[2] * integer : second signal + .. + .. + nr of samples[ns] * integer : last signal + + Bytes 0 - 127: descriptive text + Bytes 128 - 131: master tag (data type = matrix) + Bytes 132 - 135: master tag (data size) + Bytes 136 - 151: array flags (4 byte tag with data type, 4 byte + tag with subelement size, 8 bytes of content) + Bytes 152 - 167: array dimension (4 byte tag with data type, 4 + byte tag with subelement size, 8 bytes of content) + Bytes 168 - 183: array name (4 byte tag with data type, 4 byte + tag with subelement size, 8 bytes of content) + Bytes 184 - ...: array content (4 byte tag with data type, 4 byte + tag with subelement size, ... bytes of content) + + Examples + -------- + >>> wfdb.wfdb_to_edf('100', pn_dir='pwave') + + The output file name is '100.edf' + + """ + record = rdrecord( + record_name, + pn_dir=pn_dir, + sampfrom=sampfrom, + sampto=sampto, + smooth_frames=False, + ) + record_name_out = record_name.split(os.sep)[-1].replace("-", "_") + + # Maximum data block length, in bytes + edf_max_block = 61440 + + # Convert to the expected month name formatting + month_names = [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ] + + # Calculate block duration. (In the EDF spec, blocks are called "records" + # or "data records", but this would be confusing here since "record" + # refers to the entire recording -- so here we say "blocks".) + samples_per_frame = sum(record.samps_per_frame) + + # i.e., The number of frames per minute, divided by 60 + frames_per_minute = record.fs * 60 + 0.5 + frames_per_second = frames_per_minute / 60 + + # Ten seconds + frames_per_block = 10 * frames_per_second + 0.5 + + # EDF specifies 2 bytes per sample + bytes_per_block = int(2 * samples_per_frame * frames_per_block) + + # Blocks would be too long -- reduce their length by a factor of 10 + while bytes_per_block > edf_max_block: + frames_per_block /= 10 + bytes_per_block = samples_per_frame * 2 * frames_per_block + + seconds_per_block = int(frames_per_block / frames_per_second) + + if (frames_per_block < 1) and (bytes_per_block < edf_max_block / 60): + # The number of frames/minute + frames_per_block = frames_per_minute + bytes_per_block = 2 * samples_per_frame * frames_per_block + seconds_per_block = 60 + + if bytes_per_block > edf_max_block: + print( + ( + "Can't convert record %s to EDF: EDF blocks can't be larger " + "than {} bytes, but each input frame requires {} bytes. Use " + "'channels' to select a subset of the input signals or trim " + "using 'sampfrom' and 'sampto'." + ).format(edf_max_block, samples_per_frame * 2) + ) + + # Calculate the number of blocks to be written. 
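+ # (For illustration, with hypothetical numbers: sig_len = 650000 frames + # and frames_per_block = 3600 would give num_blocks = 650000 // 3600 + 1 + # = 181.) 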
The calculation rounds + # up so that we don't lose any frames, even if the number of frames is not + # an exact multiple of frames_per_block + total_frames = record.sig_len + num_blocks = int(total_frames / int(frames_per_block)) + 1 + + digital_min = [] + digital_max = [] + physical_min = [] + physical_max = [] + # Calculate the physical and digital extrema + for i in range(record.n_sig): + # Invalid ADC resolution in input .hea file + if record.adc_res[i] < 1: + # Guess the ADC resolution based on format + if record.fmt[i] == "24": + temp_adc_res = 24 + elif record.fmt[i] == "32": + temp_adc_res = 32 + elif record.fmt[i] == "80": + temp_adc_res = 8 + elif record.fmt[i] == "212": + temp_adc_res = 12 + elif (record.fmt[i] == "310") or (record.fmt[i] == "311"): + temp_adc_res = 10 + else: + temp_adc_res = 16 + else: + temp_adc_res = record.adc_res[i] + # Determine the physical and digital extrema + digital_max.append( + int(record.adc_zero[i] + (1 << (temp_adc_res - 1)) - 1) + ) + digital_min.append(int(record.adc_zero[i] - (1 << (temp_adc_res - 1)))) + physical_max.append( + (digital_max[i] - record.baseline[i]) / record.adc_gain[i] + ) + physical_min.append( + (digital_min[i] - record.baseline[i]) / record.adc_gain[i] + ) + + # The maximum record name length to write is 80 bytes + if len(record_name_out) > 80: + record_name_write = record_name_out[:79] + "\0" + else: + record_name_write = record_name_out + + # The maximum seconds per block length to write is 8 bytes + if len(str(seconds_per_block)) > 8: + seconds_per_block_write = str(seconds_per_block)[:7] + "\0" + else: + seconds_per_block_write = seconds_per_block + + # The maximum signal name length to write is 16 bytes + sig_name_write = len(record.sig_name) * [] + for s in record.sig_name: + if len(s) > 16: + sig_name_write.append(s[:15] + "\0") + else: + sig_name_write.append(s) + + # The maximum units length to write is 8 bytes + units_write = len(record.units) * [] + for s in record.units: + if len(s) > 8: + units_write.append(s[:7] + "\0") + else: + units_write.append(s) + + # Configure the output datetime + if hasattr(record, "base_datetime"): + start_second = int(record.base_datetime.second) + start_minute = int(record.base_datetime.minute) + start_hour = int(record.base_datetime.hour) + start_day = int(record.base_datetime.day) + start_month = int(record.base_datetime.month) + start_year = int(record.base_datetime.year) + else: + # Set date to start of EDF epoch + start_second = 0 + start_minute = 0 + start_hour = 0 + start_day = 1 + start_month = 1 + start_year = 1985 + + # Determine the number of bytes in the header + header_bytes = 256 * (record.n_sig + 1) + + # Determine the number of samples per data record + samps_per_record = [] + for spf in record.samps_per_frame: + samps_per_record.append(int(frames_per_block) * spf) + + # Determine the output data + # NOTE: The output data will be close (+-1) but not equal due to the + # inappropriate rounding done by record.adc() + # For example... 
+    # Determine the output data
+    # NOTE: The output data will be close (+-1) but not equal due to the
+    # inappropriate rounding done by record.adc(). For example:
+    # Mismatched elements: 862881 / 24168000 (3.57%)
+    # Max absolute difference: 1
+    # Max relative difference: 0.0212766
+    # x: array([ 53, -28,  14, ..., 884, 898, 898], dtype=int16)
+    # y: array([ 53, -28,  14, ..., 884, 898, 898], dtype=int16)
+    if record.e_p_signal is not None:
+        temp_data = record.adc(expanded=True)
+    else:
+        temp_data = record.adc()
+        temp_data = [v for v in np.transpose(temp_data)]
+
+    out_data = []
+    for i in range(record.sig_len):
+        for j, sig in enumerate(temp_data):
+            ind_start = i * samps_per_record[j]
+            ind_stop = (i + 1) * samps_per_record[j]
+            out_data.extend(sig[ind_start:ind_stop].tolist())
+    out_data = np.array(out_data, dtype=np.int16)
+
+    # Start writing the file
+    if output_filename == "":
+        output_filename = record_name_out + ".edf"
+
+    with open(output_filename, "wb") as f:
+
+        print(
+            "Converting record {} to {} ({} mode)".format(
+                record_name, output_filename, "EDF+" if edf_plus else "EDF"
+            )
+        )
+
+        # Version of this data format (8 bytes)
+        f.write(struct.pack("<8s", b"0").replace(b"\x00", b"\x20"))
+
+        # Local patient identification (80 bytes)
+        f.write(
+            struct.pack(
+                "<80s", "{}".format(record_name_write).encode("ascii")
+            ).replace(b"\x00", b"\x20")
+        )
+
+        # Local recording identification (80 bytes)
+        # Bob Kemp recommends using this field to encode the start date,
+        # including an abbreviated month name in English and a full
+        # (4-digit) year, as is done here if this information is available
+        # in the input record. EDF+ requires this.
+        if hasattr(record, "base_datetime"):
+            f.write(
+                struct.pack(
+                    "<80s",
+                    "Startdate {}-{}-{}".format(
+                        start_day, month_names[start_month - 1], start_year
+                    ).encode("ascii"),
+                ).replace(b"\x00", b"\x20")
+            )
+        else:
+            f.write(
+                struct.pack("<80s", b"Startdate not recorded").replace(
+                    b"\x00", b"\x20"
+                )
+            )
+            if edf_plus:
+                print("WARNING: EDF+ requires start date (not specified)")
+
+        # Start date of recording (dd.mm.yy) (8 bytes)
+        f.write(
+            struct.pack(
+                "<8s",
+                "{:02d}.{:02d}.{:02d}".format(
+                    start_day, start_month, start_year % 100
+                ).encode("ascii"),
+            ).replace(b"\x00", b"\x20")
+        )
+
+        # Start time of recording (hh.mm.ss) (8 bytes)
+        f.write(
+            struct.pack(
+                "<8s",
+                "{:02d}.{:02d}.{:02d}".format(
+                    start_hour, start_minute, start_second
+                ).encode("ascii"),
+            ).replace(b"\x00", b"\x20")
+        )
+
+        # Number of bytes in header (8 bytes)
+        f.write(
+            struct.pack(
+                "<8s", "{:d}".format(header_bytes).encode("ascii")
+            ).replace(b"\x00", b"\x20")
+        )
+
+        # Reserved (44 bytes)
+        if edf_plus:
+            f.write(struct.pack("<44s", b"EDF+C").replace(b"\x00", b"\x20"))
+        else:
+            f.write(struct.pack("<44s", b"").replace(b"\x00", b"\x20"))
+
+        # Number of blocks (-1 if unknown) (8 bytes)
+        f.write(
+            struct.pack(
+                "<8s", "{:d}".format(num_blocks).encode("ascii")
+            ).replace(b"\x00", b"\x20")
+        )
+
+        # Duration of a block, in seconds (8 bytes)
+        f.write(
+            struct.pack(
+                "<8s", "{:g}".format(seconds_per_block_write).encode("ascii")
+            ).replace(b"\x00", b"\x20")
+        )
+
+        # Number of signals (4 bytes)
+        f.write(
+            struct.pack(
+                "<4s", "{:d}".format(record.n_sig).encode("ascii")
+            ).replace(b"\x00", b"\x20")
+        )
+
+        # Label (e.g., EEG FpzCz or Body temp) (16 bytes each)
+        for i in sig_name_write:
+            f.write(
+                struct.pack("<16s", "{}".format(i).encode("ascii")).replace(
+                    b"\x00", b"\x20"
+                )
+            )
+
+        # Transducer type (e.g., AgAgCl electrode) (80 bytes each)
+        for _ in range(record.n_sig):
+            f.write(
+                struct.pack("<80s", b"transducer type not recorded").replace(
+                    b"\x00", b"\x20"
+                )
+            )
+
+        # Physical
dimension (e.g., uV or degreeC) (8 bytes each) + for i in units_write: + f.write( + struct.pack("<8s", "{}".format(i).encode("ascii")).replace( + b"\x00", b"\x20" + ) + ) + + # Physical minimum (e.g., -500 or 34) (8 bytes each) + for pmin in physical_min: + f.write( + struct.pack("<8s", "{:g}".format(pmin).encode("ascii")).replace( + b"\x00", b"\x20" + ) + ) + + # Physical maximum (e.g., 500 or 40) (8 bytes each) + for pmax in physical_max: + f.write( + struct.pack("<8s", "{:g}".format(pmax).encode("ascii")).replace( + b"\x00", b"\x20" + ) + ) + + # Digital minimum (e.g., -2048) (8 bytes each) + for dmin in digital_min: + f.write( + struct.pack("<8s", "{:d}".format(dmin).encode("ascii")).replace( + b"\x00", b"\x20" + ) + ) + + # Digital maximum (e.g., 2047) (8 bytes each) + for dmax in digital_max: + f.write( + struct.pack("<8s", "{:d}".format(dmax).encode("ascii")).replace( + b"\x00", b"\x20" + ) + ) + + # Prefiltering (e.g., HP:0.1Hz LP:75Hz) (80 bytes each) + for _ in range(record.n_sig): + f.write( + struct.pack("<80s", b"prefiltering not recorded").replace( + b"\x00", b"\x20" + ) + ) + + # Number of samples per block (8 bytes each) + for spr in samps_per_record: + f.write( + struct.pack("<8s", "{:d}".format(spr).encode("ascii")).replace( + b"\x00", b"\x20" + ) + ) + + # The last 32*nsig bytes in the header are unused + for _ in range(record.n_sig): + f.write(struct.pack("<32s", b"").replace(b"\x00", b"\x20")) + + # Write the data blocks + out_data.tofile(f, format="%d") + + # Add the buffer + correct_bytes = num_blocks * sum(samps_per_record) + current_bytes = len(out_data) + num_to_write = correct_bytes - current_bytes + for i in range(num_to_write): + f.write(b"\x00\x80") + + print("Header block size: {:d} bytes".format((record.n_sig + 1) * 256)) + print( + "Data block size: {:g} seconds ({:d} frames or {:d} bytes)".format( + seconds_per_block, int(frames_per_block), int(bytes_per_block) + ) + ) + print( + "Recording length: {:d} ({:d} data blocks, {:d} frames, {:d} bytes)".format( + sum( + [ + num_blocks, + num_blocks * int(frames_per_block), + num_blocks * bytes_per_block, + ] + ), + num_blocks, + num_blocks * int(frames_per_block), + num_blocks * bytes_per_block, + ) + ) + print( + "Total length of file to be written: {:d} bytes".format( + int((record.n_sig + 1) * 256 + num_blocks * bytes_per_block) + ) + ) + + if edf_plus: + print( + ( + "WARNING: EDF+ requires the subject's gender, birthdate, and name, as " + "well as additional information about the recording that is not usually " + "available. This information is not saved in the output file even if " + "available. EDF+ also requires the use of standard names for signals and " + "for physical units; these requirements are not enforced by this program. " + "To make the output file fully EDF+ compliant, its header must be edited " + "manually." + ) + ) + + if "EDF-Annotations" not in record.sig_name: + print( + "WARNING: The output file does not include EDF annotations, which are required for EDF+." + ) + + # Check that all characters in the header are valid (printable ASCII + # between 32 and 126 inclusive). Note that this test does not prevent + # generation of files containing invalid characters; it merely warns + # the user if this has happened. 
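All of the header writes above rely on the same space-padding idiom, which is also what keeps the header within printable ASCII for the check that follows. A minimal sketch of that pattern as a standalone helper; the helper name `_write_edf_field` is hypothetical, not part of the patch:

    import struct

    def _write_edf_field(f, width, value):
        # Pack `value` into a fixed-width field, swapping struct.pack's
        # NUL padding for spaces (0x20) so the header stays printable
        # ASCII. Note struct.pack("Ns") pads short values with NULs and
        # silently truncates long ones, which is why the code above
        # pre-truncates oversized fields itself.
        data = str(value).encode("ascii")
        f.write(
            struct.pack("<{}s".format(width), data).replace(b"\x00", b"\x20")
        )

    # e.g. _write_edf_field(f, 8, header_bytes) mirrors the 8-byte
    # header writes above.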
+    with open(output_filename, "rb") as hf:
+        header_test = hf.read((record.n_sig + 1) * 256)
+    for i, val in enumerate(header_test):
+        if (val < 32) or (val > 126):
+            print(
+                "WARNING: output contains an invalid character, {}, at byte {}".format(
+                    val, i
+                )
+            )
diff --git a/wfdb/io/convert/matlab.py b/wfdb/io/convert/matlab.py
new file mode 100644
index 00000000..61d06deb
--- /dev/null
+++ b/wfdb/io/convert/matlab.py
@@ -0,0 +1,310 @@
+import datetime
+import os
+import struct
+
+import numpy as np
+
+from wfdb.io.record import rdrecord
+
+
+def wfdb_to_mat(
+    record_name, pn_dir=None, sampfrom=0, sampto=None, channels=None
+):
+    """
+    This program converts the signals of any PhysioNet record (or one in any
+    compatible format) into a .mat file that can be read directly using any
+    version of Matlab, and a short text file containing information about the
+    signals (names, gains, baselines, units, sampling frequency, and start
+    time/date if known). If the input record name is REC, the output files
+    are RECm.mat and RECm.hea. The output files can also be read by any WFDB
+    application as record RECm.
+
+    This program does not convert annotation files; for that task, 'rdann'
+    is recommended.
+
+    The output .mat file contains a single matrix named `val` containing raw
+    (unshifted, unscaled) samples from the selected record. Using various
+    options, you can select any time interval within a record, or any subset
+    of the signals, which can be rearranged as desired within the rows of
+    the matrix. Since .mat files are written in column-major order (i.e.,
+    all of column n precedes all of column n+1), each vector of samples is
+    written as a column rather than as a row, so that the column number in
+    the .mat file equals the sample number in the input record (minus
+    however many samples were skipped at the beginning of the record, as
+    specified using the `sampfrom` option). If this seems odd, transpose
+    your matrix after reading it!
+
+    This program writes version 5 MAT-file format output files, as
+    documented in
+    http://www.mathworks.com/access/helpdesk/help/pdf_doc/matlab/matfile_format.pdf
+    The samples are written as 32-bit signed integers (mattype=20 below) in
+    little-endian format if the record contains any format 24 or format 32
+    signals, as 8-bit unsigned integers (mattype=50) if the record contains
+    only format 80 signals, or as 16-bit signed integers in little-endian
+    format (mattype=30) otherwise.
+
+    The maximum size of the output variable is 2^31 bytes. `wfdb2mat` from
+    versions 10.5.24 and earlier of the original WFDB software package
+    writes version 4 MAT-files, which have the additional constraint of
+    100,000,000 elements per variable.
+
+    The output files (RECm.mat + RECm.hea) are still WFDB-compatible, given
+    the .hea file constructed by this program.
+
+    Parameters
+    ----------
+    record_name : str
+        The name of the input WFDB record to be read.
+    pn_dir : str, optional
+        Option used to stream data from Physionet. The Physionet
+        database directory from which to find the required record files.
+        eg. For record '100' in 'http://physionet.org/content/mitdb'
+        pn_dir='mitdb'.
+    sampfrom : int, optional
+        The starting sample number to read for all channels.
+    sampto : int, 'end', optional
+        The sample number at which to stop reading for all channels.
+        Reads the entire duration by default.
+    channels : list, optional
+        List of integer indices specifying the channels to be read.
+        Reads all channels by default.
+
+    Returns
+    -------
+    N/A
+
+    Notes
+    -----
+    The entire file is composed of:
+
+    Bytes   0 - 127: descriptive text
+    Bytes 128 - 131: master tag (data type = matrix)
+    Bytes 132 - 135: master tag (data size)
+    Bytes 136 - 151: array flags (4 byte tag with data type, 4 byte
+                     tag with subelement size, 8 bytes of content)
+    Bytes 152 - 167: array dimension (4 byte tag with data type, 4
+                     byte tag with subelement size, 8 bytes of content)
+    Bytes 168 - 183: array name (4 byte tag with data type, 4 byte
+                     tag with subelement size, 8 bytes of content)
+    Bytes 184 - ...: array content (4 byte tag with data type, 4 byte
+                     tag with subelement size, ... bytes of content)
+
+    Examples
+    --------
+    >>> wfdb_to_mat('100', pn_dir='pwave')
+
+    The output file names are '100m.mat' and '100m.hea'
+
+    """
+    record = rdrecord(
+        record_name, pn_dir=pn_dir, sampfrom=sampfrom, sampto=sampto
+    )
+    record_name_out = record_name.split(os.sep)[-1].replace("-", "_") + "m"
+
+    # Some variables describing the format of the .mat file
+    field_version = 256  # 0x0100 or 256
+    endian_indicator = b"IM"  # little endian
+    master_type = 14  # matrix
+    sub1_type = 6  # UINT32
+    sub2_type = 5  # INT32
+    sub3_type = 1  # INT8
+    sub1_class = 6  # double precision array
+
+    # Determine if we can write 8-bit unsigned samples, or if 16 or 32 bits
+    # are needed per sample
+    bytes_per_element = 1
+    for i in range(record.n_sig):
+        if record.adc_res[i] > 0:
+            if record.adc_res[i] > 16:
+                bytes_per_element = 4
+            elif (record.adc_res[i] > 8) and (bytes_per_element < 2):
+                bytes_per_element = 2
+        else:
+            # adc_res not specified.. try to guess from format
+            if (record.fmt[i] == "24") or (record.fmt[i] == "32"):
+                bytes_per_element = 4
+            elif (record.fmt[i] != "80") and (bytes_per_element < 2):
+                bytes_per_element = 2
+
+    if bytes_per_element == 1:
+        sub4_type = 2  # MAT8
+        out_type = "<u1"
+        wfdb_type = "80"
+    elif bytes_per_element == 2:
+        sub4_type = 3  # MAT16
+        out_type = "<i2"
+        wfdb_type = "16"
+    else:
+        sub4_type = 5  # MAT32
+        out_type = "<i4"
+        wfdb_type = "32"
+
+    # Maximum size of the output variable (2**31 bytes)
+    max_length = 2**31
+
+    desired_length = record.sig_len
+    if (bytes_per_element * record.n_sig * desired_length) > max_length:
+        raise Exception("Can't write .mat file: data size exceeds 2GB limit")
+
+    # Bytes of actual data
+    bytes_of_data = bytes_per_element * record.n_sig * desired_length
+    # This is the remaining number of bytes that don't fit into integer
+    # multiple of 8: i.e.
if 18 bytes, bytes_remain = 2, from 17 to 18 + bytes_remain = bytes_of_data % 8 + + # master_bytes = (8 + 8) + (8 + 8) + (8 + 8) + (8 + bytes_of_data) + padding + # Must be integer multiple 8 + if bytes_remain == 0: + master_bytes = bytes_of_data + 56 + else: + master_bytes = bytes_of_data + 64 - (bytes_remain) + + # Start writing the file + output_file = record_name_out + ".mat" + with open(output_file, "wb") as f: + # Descriptive text (124 bytes) + f.write(struct.pack("<124s", b"MATLAB 5.0")) + # Version (2 bytes) + f.write(struct.pack(">> wfdb_to_wav('100', pn_dir='pwave') + + The output file name is '100.wav' + + """ + record = rdrecord( + record_name, + pn_dir=pn_dir, + sampfrom=sampfrom, + sampto=sampto, + smooth_frames=False, + ) + record_name_out = record_name.split(os.sep)[-1].replace("-", "_") + + # Get information needed for the header and format chunks + num_samps = record.sig_len + samps_per_second = record.fs + frame_length = record.n_sig * 2 + chunk_bytes = num_samps * frame_length + file_bytes = chunk_bytes + 36 + bits_per_sample = max(record.adc_res) + offset = record.adc_zero + shift = [(16 - v) for v in record.adc_res] + + # Start writing the file + if output_filename != "": + if not output_filename.endswith(".wav"): + raise Exception("Name of output file must end in '.wav'") + else: + output_filename = record_name_out + ".wav" + + with open(output_filename, "wb") as f: + # Write the WAV file identifier + f.write(struct.pack(">4s", b"RIFF")) + # Write the number of bytes to follow in the file + # (num_samps*frame_length) sample bytes, and 36 more bytes of miscellaneous embedded header + f.write(struct.pack("8s", b"WAVEfmt ")) + # Number of bytes to follow in the format chunk + f.write(struct.pack("4s", b"data")) + # The number of bytes in the signal data chunk + f.write(struct.pack(">> record = read_wav('sample-data/SC4001E0-PSG.wav') + + """ + if not record_name.endswith(".wav"): + raise Exception("Name of the input file must end in .wav") + + if pn_dir is not None: + + if "." 
not in pn_dir: + dir_list = pn_dir.split("/") + pn_dir = posixpath.join( + dir_list[0], get_version(dir_list[0]), *dir_list[1:] + ) + + file_url = posixpath.join(download.PN_INDEX_URL, pn_dir, record_name) + # Currently must download file to read it though can give the + # user the option to delete it immediately afterwards + with _url.openurl(file_url, "rb") as f: + open(record_name, "wb").write(f.read()) + + wave_file = open(record_name, mode="rb") + record_name_out = ( + record_name.split(os.sep)[-1].replace("-", "_").replace(".wav", "") + ) + + chunk_ID = "".join( + [s.decode() for s in struct.unpack(">4s", wave_file.read(4))] + ) + if chunk_ID != "RIFF": + raise Exception("{} is not a .wav-format file".format(record_name)) + + correct_chunk_size = os.path.getsize(record_name) - 8 + chunk_size = struct.unpack("4s", wave_file.read(4))[0].decode() + if fmt != "WAVE": + raise Exception("{} is not a .wav-format file".format(record_name)) + + subchunk1_ID = struct.unpack(">4s", wave_file.read(4))[0].decode() + if subchunk1_ID != "fmt ": + raise Exception("Format chunk missing or corrupt") + + subchunk1_size = struct.unpack(" 1: + print("PCM has compression of {}".format(audio_format)) + + if (subchunk1_size != 16) or (audio_format != 1): + raise Exception("Unsupported format {}".format(audio_format)) + + num_channels = struct.unpack("4s", wave_file.read(4))[0].decode() + if subchunk2_ID != "data": + raise Exception("Format chunk missing or corrupt") + + correct_subchunk2_size = os.path.getsize(record_name) - 44 + subchunk2_size = struct.unpack(">> edf_record = wfdb.edf2mit('x001_FAROS.edf', - pn_dir='simultaneous-measurements/raw_data') - - """ - if pn_dir is not None: - - if "." not in pn_dir: - dir_list = pn_dir.split("/") - pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] - ) - - file_url = posixpath.join(download.PN_INDEX_URL, pn_dir, record_name) - # Currently must download file for MNE to read it though can give the - # user the option to delete it immediately afterwards - with _url.openurl(file_url, "rb") as f: - open(record_name, "wb").write(f.read()) - - # Open the desired file - edf_file = open(record_name, mode="rb") - - # Remove the file if the `delete_file` flag is set - if pn_dir is not None and delete_file: - os.remove(record_name) - - # Version of this data format (8 bytes) - version = struct.unpack("<8s", edf_file.read(8))[0].decode() - - # Check to see that the input is an EDF file. (This check will detect - # most but not all other types of files.) - if version != "0 ": - raise Exception( - "Input does not appear to be EDF -- no conversion attempted" - ) - else: - if verbose: - print("EDF version number: {}".format(version.strip())) - - # Local patient identification (80 bytes) - patient_id = struct.unpack("<80s", edf_file.read(80))[0].decode() - if verbose: - print("Patient ID: {}".format(patient_id)) - - # Local recording identification (80 bytes) - # Bob Kemp recommends using this field to encode the start date - # including an abbreviated month name in English and a full (4-digit) - # year, as is done here if this information is available in the input - # record. EDF+ requires this. 
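The removed reader below walks the same fixed-width ASCII fields in order; here is a minimal standalone sketch of that parsing pattern, including the two-digit-year windowing used further down ('example.edf' is a placeholder path, not a file from this repository):

    import struct

    # Parse the first few fixed-width ASCII fields of an EDF header.
    with open("example.edf", "rb") as edf_file:
        version = struct.unpack("<8s", edf_file.read(8))[0].decode()
        patient_id = struct.unpack("<80s", edf_file.read(80))[0].decode()
        record_id = struct.unpack("<80s", edf_file.read(80))[0].decode()
        start_date = struct.unpack("<8s", edf_file.read(8))[0].decode()

    # EDF stores two-digit years; the windowing below maps 70-99 to
    # 1970-1999 and 00-69 to 2000-2069, matching the reader's logic.
    day, month, year = (int(i) for i in start_date.split("."))
    if year < 1970:
        year += 1900
    if year < 1970:
        year += 100
    print(version.strip(), day, month, year)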
- record_id = struct.unpack("<80s", edf_file.read(80))[0].decode() - if verbose: - print("Recording ID: {}".format(record_id)) - - # Start date of recording (dd.mm.yy) (8 bytes) - start_date = struct.unpack("<8s", edf_file.read(8))[0].decode() - if verbose: - print("Recording Date: {}".format(start_date)) - start_day, start_month, start_year = [int(i) for i in start_date.split(".")] - # This should work for a while - if start_year < 1970: - start_year += 1900 - if start_year < 1970: - start_year += 100 - - # Start time of recording (hh.mm.ss) (8 bytes) - start_time = struct.unpack("<8s", edf_file.read(8))[0].decode() - if verbose: - print("Recording Time: {}".format(start_time)) - start_hour, start_minute, start_second = [ - int(i) for i in start_time.split(".") - ] - - # Number of bytes in header (8 bytes) - header_bytes = int(struct.unpack("<8s", edf_file.read(8))[0].decode()) - if verbose: - print("Number of bytes in header record: {}".format(header_bytes)) - - # Reserved (44 bytes) - reserved_notes = ( - struct.unpack("<44s", edf_file.read(44))[0].decode().strip() - ) - if reserved_notes[:5] == "EDF+C": - # The file is EDF compatible and will work without issue - # See: Bob Kemp, Jesus Olivan, European data format ‘plus’ (EDF+), an - # EDF alike standard format for the exchange of physiological - # data, Clinical Neurophysiology, Volume 114, Issue 9, 2003, - # Pages 1755-1761, ISSN 1388-2457 - pass - elif reserved_notes[:5] == "EDF+D": - raise Exception( - "EDF+ File: interrupted data records (not currently supported)" - ) - else: - if verbose: - print("Free Space: {}".format(reserved_notes)) - - # Number of blocks (-1 if unknown) (8 bytes) - num_blocks = int(struct.unpack("<8s", edf_file.read(8))[0].decode()) - if verbose: - print("Number of data records: {}".format(num_blocks)) - if num_blocks == -1: - raise Exception( - "Number of data records in unknown (not currently supported)" - ) - - # Duration of a block, in seconds (8 bytes) - block_duration = float(struct.unpack("<8s", edf_file.read(8))[0].decode()) - if verbose: - print( - "Duration of each data record in seconds: {}".format(block_duration) - ) - if block_duration <= 0.0: - block_duration = 1.0 - - # Number of signals (4 bytes) - n_sig = int(struct.unpack("<4s", edf_file.read(4))[0].decode()) - if verbose: - print("Number of signals: {}".format(n_sig)) - if n_sig < 1: - raise Exception("Done: not any signals left to read") - - # Label (e.g., EEG FpzCz or Body temp) (16 bytes each) - sig_name = [] - for _ in range(n_sig): - temp_sig = struct.unpack("<16s", edf_file.read(16))[0].decode().strip() - if temp_sig == "EDF Annotations" and not rdedfann_flag: - print( - "*** This may be an EDF+ Annotation file instead, please see " - "the `rdedfann` function. 
***" - ) - sig_name.append(temp_sig) - if verbose: - print("Signal Labels: {}".format(sig_name)) - - # Transducer type (e.g., AgAgCl electrode) (80 bytes each) - transducer_types = [] - for _ in range(n_sig): - transducer_types.append( - struct.unpack("<80s", edf_file.read(80))[0].decode().strip() - ) - if verbose: - print("Transducer Types: {}".format(transducer_types)) - - # Physical dimension (e.g., uV or degreeC) (8 bytes each) - physical_dims = [] - for _ in range(n_sig): - physical_dims.append( - struct.unpack("<8s", edf_file.read(8))[0].decode().strip() - ) - if verbose: - print("Physical Dimensions: {}".format(physical_dims)) - - # Physical minimum (e.g., -500 or 34) (8 bytes each) - physical_min = np.array([]) - for _ in range(n_sig): - physical_min = np.append( - physical_min, - float(struct.unpack("<8s", edf_file.read(8))[0].decode()), - ) - if verbose: - print("Physical Minimums: {}".format(physical_min)) - - # Physical maximum (e.g., 500 or 40) (8 bytes each) - physical_max = np.array([]) - for _ in range(n_sig): - physical_max = np.append( - physical_max, - float(struct.unpack("<8s", edf_file.read(8))[0].decode()), - ) - if verbose: - print("Physical Maximums: {}".format(physical_max)) - - # Digital minimum (e.g., -2048) (8 bytes each) - digital_min = np.array([]) - for _ in range(n_sig): - digital_min = np.append( - digital_min, - float(struct.unpack("<8s", edf_file.read(8))[0].decode()), - ) - if verbose: - print("Digital Minimums: {}".format(digital_min)) - - # Digital maximum (e.g., 2047) (8 bytes each) - digital_max = np.array([]) - for _ in range(n_sig): - digital_max = np.append( - digital_max, - float(struct.unpack("<8s", edf_file.read(8))[0].decode()), - ) - if verbose: - print("Digital Maximums: {}".format(digital_max)) - - # Prefiltering (e.g., HP:0.1Hz LP:75Hz) (80 bytes each) - prefilter_info = [] - for _ in range(n_sig): - prefilter_info.append( - struct.unpack("<80s", edf_file.read(80))[0].decode().strip() - ) - if verbose: - print("Prefiltering Information: {}".format(prefilter_info)) - - # Number of samples per block (8 bytes each) - samps_per_block = [] - for _ in range(n_sig): - samps_per_block.append( - int(struct.unpack("<8s", edf_file.read(8))[0].decode()) - ) - if verbose: - print("Number of Samples per Record: {}".format(samps_per_block)) - - # The last 32*nsig bytes in the header are unused - for _ in range(n_sig): - struct.unpack("<32s", edf_file.read(32))[0].decode() - - # Pre-process the acquired data before creating the record - record_name_out = ( - record_name.split(os.sep)[-1].replace("-", "_").replace(".edf", "") - ) - sample_rate = [int(i / block_duration) for i in samps_per_block] - fs = functools.reduce(math.gcd, sample_rate) - samps_per_frame = [int(s / min(samps_per_block)) for s in samps_per_block] - sig_len = int(fs * num_blocks * block_duration) - base_time = datetime.time(start_hour, start_minute, start_second) - base_date = datetime.date(start_year, start_month, start_day) - file_name = n_sig * [record_name_out + ".dat"] - fmt = n_sig * ["16"] - skew = n_sig * [None] - byte_offset = n_sig * [None] - adc_gain_all = (digital_max - digital_min) / (physical_max - physical_min) - adc_gain = [float(format(a, ".12g")) for a in adc_gain_all] - baseline = (digital_max - (physical_max * adc_gain_all) + 1).astype("int64") - - units = n_sig * [""] - for i, f in enumerate(physical_dims): - if f == "n/a": - label = sig_name[i].lower().split()[0] - if label in list(SIG_UNITS.keys()): - units[i] = SIG_UNITS[label] - else: - units[i] = "n/a" - else: 
- f = f.replace("µ", "u") # Maybe more weird symbols to check for? - if f == "": - units[i] = "mV" - else: - units[i] = f - - adc_res = [int(math.log2(f)) for f in (digital_max - digital_min)] - adc_zero = [int(f) for f in ((digital_max + 1 + digital_min) / 2)] - block_size = n_sig * [0] - base_datetime = datetime.datetime( - start_year, - start_month, - start_day, - start_hour, - start_minute, - start_second, - ) - base_time = datetime.time( - base_datetime.hour, base_datetime.minute, base_datetime.second - ) - base_date = datetime.date( - base_datetime.year, base_datetime.month, base_datetime.day - ) - - if header_only: - return { - "fs": fs, - "sig_len": sig_len, - "n_sig": n_sig, - "base_date": base_date, - "base_time": base_time, - "units": units, - "sig_name": sig_name, - "comments": [], - } - - sig_data = np.empty((sig_len, n_sig)) - temp_sig_data = np.fromfile(edf_file, dtype=np.int16) - temp_sig_data = temp_sig_data.reshape((-1, sum(samps_per_block))) - temp_all_sigs = np.hsplit(temp_sig_data, np.cumsum(samps_per_block)[:-1]) - for i in range(n_sig): - # Check if `samps_per_frame` has all equal values - if samps_per_frame.count(samps_per_frame[0]) == len(samps_per_frame): - sig_data[:, i] = ( - temp_all_sigs[i].flatten() - baseline[i] - ) / adc_gain_all[i] - else: - temp_sig_data = temp_all_sigs[i].flatten() - if samps_per_frame[i] == 1: - sig_data[:, i] = (temp_sig_data - baseline[i]) / adc_gain_all[i] - else: - for j in range(sig_len): - start_ind = j * samps_per_frame[i] - stop_ind = start_ind + samps_per_frame[i] - sig_data[j, i] = np.mean( - (temp_sig_data[start_ind:stop_ind] - baseline[i]) - / adc_gain_all[i] - ) - - # This is the closest I can get to the original implementation - # NOTE: This is done using `np.testing.assert_array_equal()` - # Mismatched elements: 15085545 / 15400000 (98%) - # Max absolute difference: 3.75166564e-12 - # Max relative difference: 5.41846079e-15 - # x: array([[ -3.580728, 42.835293, -102.818048, 54.978632, -52.354247], - # [ -8.340205, 43.079939, -102.106351, 56.402027, -44.992626], - # [ -5.004123, 43.546991, -99.481966, 51.64255 , -43.079939],... - # y: array([[ -3.580728, 42.835293, -102.818048, 54.978632, -52.354247], - # [ -8.340205, 43.079939, -102.106351, 56.402027, -44.992626], - # [ -5.004123, 43.546991, -99.481966, 51.64255 , -43.079939],... - - init_value = [int(s[0, 0]) for s in temp_all_sigs] - checksum = [ - int(np.sum(v) % 65536) for v in np.transpose(sig_data) - ] # not all values correct? - - record = Record( - record_name=record_name_out, - n_sig=n_sig, - fs=fs, - samps_per_frame=samps_per_frame, - counter_freq=None, - base_counter=None, - sig_len=sig_len, - base_time=base_time, - base_date=base_date, - comments=[], - sig_name=sig_name, # Remove whitespace to make compatible later? 
- p_signal=sig_data, - d_signal=None, - e_p_signal=None, - e_d_signal=None, - file_name=n_sig * [record_name_out + ".dat"], - fmt=n_sig * ["16"], - skew=n_sig * [None], - byte_offset=n_sig * [None], - adc_gain=adc_gain, - baseline=baseline, - units=units, - adc_res=[int(math.log2(f)) for f in (digital_max - digital_min)], - adc_zero=[int(f) for f in ((digital_max + 1 + digital_min) / 2)], - init_value=init_value, - checksum=checksum, - block_size=n_sig * [0], - ) - - record.base_datetime = base_datetime - - if record_only: - return record - else: - # TODO: Generate the .dat and .hea files - pass - - -def mit2edf( - record_name, - pn_dir=None, - sampfrom=0, - sampto=None, - channels=None, - output_filename="", - edf_plus=False, -): - """ - These programs convert EDF (European Data Format) files into - WFDB-compatible files (as used in PhysioNet) and vice versa. European - Data Format (EDF) was originally designed for storage of polysomnograms. - - Note that WFDB format does not include a standard way to specify the - transducer type or the prefiltering specification; these parameters are - not preserved by these conversion programs. Also note that use of the - standard signal and unit names specified for EDF is permitted but not - enforced by `mit2edf`. - - Parameters - ---------- - record_name : str - The name of the input WFDB record to be read. Can also work with both - EDF and WAV files. - pn_dir : str, optional - Option used to stream data from Physionet. The Physionet - database directory from which to find the required record files. - eg. For record '100' in 'http://physionet.org/content/mitdb' - pn_dir='mitdb'. - sampfrom : int, optional - The starting sample number to read for all channels. - sampto : int, 'end', optional - The sample number at which to stop reading for all channels. - Reads the entire duration by default. - channels : list, optional - List of integer indices specifying the channels to be read. - Reads all channels by default. - output_filename : str, optional - The desired name of the output file. If this value set to the - default value of '', then the output filename will be 'REC.edf'. - edf_plus : bool, optional - Whether to write the output file in EDF (False) or EDF+ (True) format. - - Returns - ------- - N/A - - Notes - ----- - The entire file is composed of (seen here: https://www.edfplus.info/specs/edf.html): - - HEADER RECORD (we suggest to also adopt the 12 simple additional EDF+ specs) - 8 ascii : version of this data format (0) - 80 ascii : local patient identification (mind item 3 of the additional EDF+ specs) - 80 ascii : local recording identification (mind item 4 of the additional EDF+ specs) - 8 ascii : startdate of recording (dd.mm.yy) (mind item 2 of the additional EDF+ specs) - 8 ascii : starttime of recording (hh.mm.ss) - 8 ascii : number of bytes in header record - 44 ascii : reserved - 8 ascii : number of data records (-1 if unknown, obey item 10 of the additional EDF+ specs) - 8 ascii : duration of a data record, in seconds - 4 ascii : number of signals (ns) in data record - ns * 16 ascii : ns * label (e.g. EEG Fpz-Cz or Body temp) (mind item 9 of the additional EDF+ specs) - ns * 80 ascii : ns * transducer type (e.g. AgAgCl electrode) - ns * 8 ascii : ns * physical dimension (e.g. uV or degreeC) - ns * 8 ascii : ns * physical minimum (e.g. -500 or 34) - ns * 8 ascii : ns * physical maximum (e.g. 500 or 40) - ns * 8 ascii : ns * digital minimum (e.g. -2048) - ns * 8 ascii : ns * digital maximum (e.g. 
2047) - ns * 80 ascii : ns * prefiltering (e.g. HP:0.1Hz LP:75Hz) - ns * 8 ascii : ns * nr of samples in each data record - ns * 32 ascii : ns * reserved - - DATA RECORD - nr of samples[1] * integer : first signal in the data record - nr of samples[2] * integer : second signal - .. - .. - nr of samples[ns] * integer : last signal - - Bytes 0 - 127: descriptive text - Bytes 128 - 131: master tag (data type = matrix) - Bytes 132 - 135: master tag (data size) - Bytes 136 - 151: array flags (4 byte tag with data type, 4 byte - tag with subelement size, 8 bytes of content) - Bytes 152 - 167: array dimension (4 byte tag with data type, 4 - byte tag with subelement size, 8 bytes of content) - Bytes 168 - 183: array name (4 byte tag with data type, 4 byte - tag with subelement size, 8 bytes of content) - Bytes 184 - ...: array content (4 byte tag with data type, 4 byte - tag with subelement size, ... bytes of content) - - Examples - -------- - >>> wfdb.mit2edf('100', pn_dir='pwave') - - The output file name is '100.edf' - - """ - record = rdrecord( - record_name, - pn_dir=pn_dir, - sampfrom=sampfrom, - sampto=sampto, - smooth_frames=False, - ) - record_name_out = record_name.split(os.sep)[-1].replace("-", "_") - - # Maximum data block length, in bytes - edf_max_block = 61440 - - # Convert to the expected month name formatting - month_names = [ - "JAN", - "FEB", - "MAR", - "APR", - "MAY", - "JUN", - "JUL", - "AUG", - "SEP", - "OCT", - "NOV", - "DEC", - ] - - # Calculate block duration. (In the EDF spec, blocks are called "records" - # or "data records", but this would be confusing here since "record" - # refers to the entire recording -- so here we say "blocks".) - samples_per_frame = sum(record.samps_per_frame) - - # i.e., The number of frames per minute, divided by 60 - frames_per_minute = record.fs * 60 + 0.5 - frames_per_second = frames_per_minute / 60 - - # Ten seconds - frames_per_block = 10 * frames_per_second + 0.5 - - # EDF specifies 2 bytes per sample - bytes_per_block = int(2 * samples_per_frame * frames_per_block) - - # Blocks would be too long -- reduce their length by a factor of 10 - while bytes_per_block > edf_max_block: - frames_per_block /= 10 - bytes_per_block = samples_per_frame * 2 * frames_per_block - - seconds_per_block = int(frames_per_block / frames_per_second) - - if (frames_per_block < 1) and (bytes_per_block < edf_max_block / 60): - # The number of frames/minute - frames_per_block = frames_per_minute - bytes_per_block = 2 * samples_per_frame * frames_per_block - seconds_per_block = 60 - - if bytes_per_block > edf_max_block: - print( - ( - "Can't convert record %s to EDF: EDF blocks can't be larger " - "than {} bytes, but each input frame requires {} bytes. Use " - "'channels' to select a subset of the input signals or trim " - "using 'sampfrom' and 'sampto'." - ).format(edf_max_block, samples_per_frame * 2) - ) - - # Calculate the number of blocks to be written. 
The calculation rounds - # up so that we don't lose any frames, even if the number of frames is not - # an exact multiple of frames_per_block - total_frames = record.sig_len - num_blocks = int(total_frames / int(frames_per_block)) + 1 - - digital_min = [] - digital_max = [] - physical_min = [] - physical_max = [] - # Calculate the physical and digital extrema - for i in range(record.n_sig): - # Invalid ADC resolution in input .hea file - if record.adc_res[i] < 1: - # Guess the ADC resolution based on format - if record.fmt[i] == "24": - temp_adc_res = 24 - elif record.fmt[i] == "32": - temp_adc_res = 32 - elif record.fmt[i] == "80": - temp_adc_res = 8 - elif record.fmt[i] == "212": - temp_adc_res = 12 - elif (record.fmt[i] == "310") or (record.fmt[i] == "311"): - temp_adc_res = 10 - else: - temp_adc_res = 16 - else: - temp_adc_res = record.adc_res[i] - # Determine the physical and digital extrema - digital_max.append( - int(record.adc_zero[i] + (1 << (temp_adc_res - 1)) - 1) - ) - digital_min.append(int(record.adc_zero[i] - (1 << (temp_adc_res - 1)))) - physical_max.append( - (digital_max[i] - record.baseline[i]) / record.adc_gain[i] - ) - physical_min.append( - (digital_min[i] - record.baseline[i]) / record.adc_gain[i] - ) - - # The maximum record name length to write is 80 bytes - if len(record_name_out) > 80: - record_name_write = record_name_out[:79] + "\0" - else: - record_name_write = record_name_out - - # The maximum seconds per block length to write is 8 bytes - if len(str(seconds_per_block)) > 8: - seconds_per_block_write = seconds_per_block[:7] + "\0" - else: - seconds_per_block_write = seconds_per_block - - # The maximum signal name length to write is 16 bytes - sig_name_write = len(record.sig_name) * [] - for s in record.sig_name: - if len(s) > 16: - sig_name_write.append(s[:15] + "\0") - else: - sig_name_write.append(s) - - # The maximum units length to write is 8 bytes - units_write = len(record.units) * [] - for s in record.units: - if len(s) > 8: - units_write.append(s[:7] + "\0") - else: - units_write.append(s) - - # Configure the output datetime - if hasattr("record", "base_datetime"): - start_second = int(record.base_datetime.second) - start_minute = int(record.base_datetime.minute) - start_hour = int(record.base_datetime.hour) - start_day = int(record.base_datetime.day) - start_month = int(record.base_datetime.month) - start_year = int(record.base_datetime.year) - else: - # Set date to start of EDF epoch - start_second = 0 - start_minute = 0 - start_hour = 0 - start_day = 1 - start_month = 1 - start_year = 1985 - - # Determine the number of bytes in the header - header_bytes = 256 * (record.n_sig + 1) - - # Determine the number of samples per data record - samps_per_record = [] - for spf in record.samps_per_frame: - samps_per_record.append(int(frames_per_block) * spf) - - # Determine the output data - # NOTE: The output data will be close (+-1) but not equal due to the - # inappropriate rounding done by record.adc() - # For example... 
- # Mismatched elements: 862881 / 24168000 (3.57%) - # Max absolute difference: 1 - # Max relative difference: 0.0212766 - # x: array([ 53, -28, 14, ..., 884, 898, 898], dtype=int16) - # y: array([ 53, -28, 14, ..., 884, 898, 898], dtype=int16) - if record.e_p_signal is not None: - temp_data = record.adc(expanded=True) - else: - temp_data = record.adc() - temp_data = [v for v in np.transpose(temp_data)] - - out_data = [] - for i in range(record.sig_len): - for j, sig in enumerate(temp_data): - ind_start = i * samps_per_record[j] - ind_stop = (i + 1) * samps_per_record[j] - out_data.extend(sig[ind_start:ind_stop].tolist()) - out_data = np.array(out_data, dtype=np.int16) - - # Start writing the file - if output_filename == "": - output_filename = record_name_out + ".edf" - - with open(output_filename, "wb") as f: - - print( - "Converting record {} to {} ({} mode)".format( - record_name, output_filename, "EDF+" if edf_plus else "EDF" - ) - ) - - # Version of this data format (8 bytes) - f.write(struct.pack("<8s", b"0").replace(b"\x00", b"\x20")) - - # Local patient identification (80 bytes) - f.write( - struct.pack( - "<80s", "{}".format(record_name_write).encode("ascii") - ).replace(b"\x00", b"\x20") - ) - - # Local recording identification (80 bytes) - # Bob Kemp recommends using this field to encode the start date - # including an abbreviated month name in English and a full (4-digit) - # year, as is done here if this information is available in the input - # record. EDF+ requires this. - if hasattr("record", "base_datetime"): - f.write( - struct.pack( - "<80s", - "Startdate {}-{}-{}".format( - start_day, month_names[start_month - 1], start_year - ).encode("ascii"), - ).replace(b"\x00", b"\x20") - ) - else: - f.write( - struct.pack("<80s", b"Startdate not recorded").replace( - b"\x00", b"\x20" - ) - ) - if edf_plus: - print("WARNING: EDF+ requires start date (not specified)") - - # Start date of recording (dd.mm.yy) (8 bytes) - f.write( - struct.pack( - "<8s", - "{:02d}.{:02d}.{:02d}".format( - start_day, start_month, start_year % 100 - ).encode("ascii"), - ).replace(b"\x00", b"\x20") - ) - - # Start time of recording (hh.mm.ss) (8 bytes) - f.write( - struct.pack( - "<8s", - "{:02d}.{:02d}.{:02d}".format( - start_hour, start_minute, start_second - ).encode("ascii"), - ).replace(b"\x00", b"\x20") - ) - - # Number of bytes in header (8 bytes) - f.write( - struct.pack( - "<8s", "{:d}".format(header_bytes).encode("ascii") - ).replace(b"\x00", b"\x20") - ) - - # Reserved (44 bytes) - if edf_plus: - f.write(struct.pack("<44s", b"EDF+C").replace(b"\x00", b"\x20")) - else: - f.write(struct.pack("<44s", b"").replace(b"\x00", b"\x20")) - - # Number of blocks (-1 if unknown) (8 bytes) - f.write( - struct.pack( - "<8s", "{:d}".format(num_blocks).encode("ascii") - ).replace(b"\x00", b"\x20") - ) - - # Duration of a block, in seconds (8 bytes) - f.write( - struct.pack( - "<8s", "{:g}".format(seconds_per_block_write).encode("ascii") - ).replace(b"\x00", b"\x20") - ) - - # Number of signals (4 bytes) - f.write( - struct.pack( - "<4s", "{:d}".format(record.n_sig).encode("ascii") - ).replace(b"\x00", b"\x20") - ) - - # Label (e.g., EEG FpzCz or Body temp) (16 bytes each) - for i in sig_name_write: - f.write( - struct.pack("<16s", "{}".format(i).encode("ascii")).replace( - b"\x00", b"\x20" - ) - ) - - # Transducer type (e.g., AgAgCl electrode) (80 bytes each) - for _ in range(record.n_sig): - f.write( - struct.pack("<80s", b"transducer type not recorded").replace( - b"\x00", b"\x20" - ) - ) - - # Physical 
dimension (e.g., uV or degreeC) (8 bytes each) - for i in units_write: - f.write( - struct.pack("<8s", "{}".format(i).encode("ascii")).replace( - b"\x00", b"\x20" - ) - ) - - # Physical minimum (e.g., -500 or 34) (8 bytes each) - for pmin in physical_min: - f.write( - struct.pack("<8s", "{:g}".format(pmin).encode("ascii")).replace( - b"\x00", b"\x20" - ) - ) - - # Physical maximum (e.g., 500 or 40) (8 bytes each) - for pmax in physical_max: - f.write( - struct.pack("<8s", "{:g}".format(pmax).encode("ascii")).replace( - b"\x00", b"\x20" - ) - ) - - # Digital minimum (e.g., -2048) (8 bytes each) - for dmin in digital_min: - f.write( - struct.pack("<8s", "{:d}".format(dmin).encode("ascii")).replace( - b"\x00", b"\x20" - ) - ) - - # Digital maximum (e.g., 2047) (8 bytes each) - for dmax in digital_max: - f.write( - struct.pack("<8s", "{:d}".format(dmax).encode("ascii")).replace( - b"\x00", b"\x20" - ) - ) - - # Prefiltering (e.g., HP:0.1Hz LP:75Hz) (80 bytes each) - for _ in range(record.n_sig): - f.write( - struct.pack("<80s", b"prefiltering not recorded").replace( - b"\x00", b"\x20" - ) - ) - - # Number of samples per block (8 bytes each) - for spr in samps_per_record: - f.write( - struct.pack("<8s", "{:d}".format(spr).encode("ascii")).replace( - b"\x00", b"\x20" - ) - ) - - # The last 32*nsig bytes in the header are unused - for _ in range(record.n_sig): - f.write(struct.pack("<32s", b"").replace(b"\x00", b"\x20")) - - # Write the data blocks - out_data.tofile(f, format="%d") - - # Add the buffer - correct_bytes = num_blocks * sum(samps_per_record) - current_bytes = len(out_data) - num_to_write = correct_bytes - current_bytes - for i in range(num_to_write): - f.write(b"\x00\x80") - - print("Header block size: {:d} bytes".format((record.n_sig + 1) * 256)) - print( - "Data block size: {:g} seconds ({:d} frames or {:d} bytes)".format( - seconds_per_block, int(frames_per_block), int(bytes_per_block) - ) - ) - print( - "Recording length: {:d} ({:d} data blocks, {:d} frames, {:d} bytes)".format( - sum( - [ - num_blocks, - num_blocks * int(frames_per_block), - num_blocks * bytes_per_block, - ] - ), - num_blocks, - num_blocks * int(frames_per_block), - num_blocks * bytes_per_block, - ) - ) - print( - "Total length of file to be written: {:d} bytes".format( - int((record.n_sig + 1) * 256 + num_blocks * bytes_per_block) - ) - ) - - if edf_plus: - print( - ( - "WARNING: EDF+ requires the subject's gender, birthdate, and name, as " - "well as additional information about the recording that is not usually " - "available. This information is not saved in the output file even if " - "available. EDF+ also requires the use of standard names for signals and " - "for physical units; these requirements are not enforced by this program. " - "To make the output file fully EDF+ compliant, its header must be edited " - "manually." - ) - ) - - if "EDF-Annotations" not in record.sig_name: - print( - "WARNING: The output file does not include EDF annotations, which are required for EDF+." - ) - - # Check that all characters in the header are valid (printable ASCII - # between 32 and 126 inclusive). Note that this test does not prevent - # generation of files containing invalid characters; it merely warns - # the user if this has happened. 
- header_test = open(output_filename, "rb").read((record.n_sig + 1) * 256) - for i, val in enumerate(header_test): - if (val < 32) or (val > 126): - print( - "WARNING: output contains an invalid character, {}, at byte {}".format( - val, i - ) - ) - - -def mit2wav( - record_name, - pn_dir=None, - sampfrom=0, - sampto=None, - channels=None, - output_filename="", - write_header=False, -): - """ - This program converts a WFDB record into .wav format (format 16, multiplexed - signals, with embedded header information). Use 'wav2mit' to perform the - reverse conversion. - - Parameters - ---------- - record_name : str - The name of the input WFDB record to be read. Can also work with both - EDF and WAV files. - pn_dir : str, optional - Option used to stream data from Physionet. The Physionet - database directory from which to find the required record files. - eg. For record '100' in 'http://physionet.org/content/mitdb' - pn_dir='mitdb'. - sampfrom : int, optional - The starting sample number to read for all channels. - sampto : int, 'end', optional - The sample number at which to stop reading for all channels. - Reads the entire duration by default. - channels : list, optional - List of integer indices specifying the channels to be read. - Reads all channels by default. - output_filename : str, optional - The desired name of the output file. If this value set to the - default value of '', then the output filename will be 'REC.wav'. - write_header : bool, optional - Whether to write (True) or not to write (False) a header file to - accompany the generated WAV file. The default value is 'False'. - - Returns - ------- - N/A - - Notes - ----- - Files that can be processed successfully using `wav2mit` always have exactly - three chunks (a header chunk, a format chunk, and a data chunk). In .wav - files, binary data are always written in little-endian format (least - significant byte first). The format of `wav2mit`'s input files is as follows: - - [Header chunk] - Bytes 0 - 3: "RIFF" [4 ASCII characters] - Bytes 4 - 7: L-8 (number of bytes to follow in the file, excluding bytes 0-7) - Bytes 8 - 11: "WAVE" [4 ASCII characters] - - [Format chunk] - Bytes 12 - 15: "fmt " [4 ASCII characters, note trailing space] - Bytes 16 - 19: 16 (format chunk length in bytes, excluding bytes 12-19) - Bytes 20 - 35: format specification, consisting of: - Bytes 20 - 21: 1 (format tag, indicating no compression is used) - Bytes 22 - 23: number of signals (1 - 65535) - Bytes 24 - 27: sampling frequency in Hz (per signal) - Note that the sampling frequency in a .wav file must be an - integer multiple of 1 Hz, a restriction that is not imposed - by MIT (WFDB) format. - Bytes 28 - 31: bytes per second (sampling frequency * frame size in bytes) - Bytes 32 - 33: frame size in bytes - Bytes 34 - 35: bits per sample (ADC resolution in bits) - Note that the actual ADC resolution (e.g., 12) is written in - this field, although each output sample is right-padded to fill - a full (16-bit) word. (.wav format allows for 8, 16, 24, and - 32 bits per sample) - - [Data chunk] - Bytes 36 - 39: "data" [4 ASCII characters] - Bytes 40 - 43: L-44 (number of bytes to follow in the data chunk) - Bytes 44 - L-1: sample data, consisting of: - Bytes 44 - 45: sample 0, channel 0 - Bytes 46 - 47: sample 0, channel 1 - ... etc. 
(same order as in a multiplexed WFDB signal file) - - Examples - -------- - >>> wfdb.mit2wav('100', pn_dir='pwave') - - The output file name is '100.wav' - - """ - record = rdrecord( - record_name, - pn_dir=pn_dir, - sampfrom=sampfrom, - sampto=sampto, - smooth_frames=False, - ) - record_name_out = record_name.split(os.sep)[-1].replace("-", "_") - - # Get information needed for the header and format chunks - num_samps = record.sig_len - samps_per_second = record.fs - frame_length = record.n_sig * 2 - chunk_bytes = num_samps * frame_length - file_bytes = chunk_bytes + 36 - bits_per_sample = max(record.adc_res) - offset = record.adc_zero - shift = [(16 - v) for v in record.adc_res] - - # Start writing the file - if output_filename != "": - if not output_filename.endswith(".wav"): - raise Exception("Name of output file must end in '.wav'") - else: - output_filename = record_name_out + ".wav" - - with open(output_filename, "wb") as f: - # Write the WAV file identifier - f.write(struct.pack(">4s", b"RIFF")) - # Write the number of bytes to follow in the file - # (num_samps*frame_length) sample bytes, and 36 more bytes of miscellaneous embedded header - f.write(struct.pack("8s", b"WAVEfmt ")) - # Number of bytes to follow in the format chunk - f.write(struct.pack("4s", b"data")) - # The number of bytes in the signal data chunk - f.write(struct.pack(">> wav_record = wfdb.wav2mit('sample-data/SC4001E0-PSG.wav', record_only=True) - - """ - if not record_name.endswith(".wav"): - raise Exception("Name of the input file must end in .wav") - - if pn_dir is not None: - - if "." not in pn_dir: - dir_list = pn_dir.split("/") - pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] - ) - - file_url = posixpath.join(download.PN_INDEX_URL, pn_dir, record_name) - # Currently must download file to read it though can give the - # user the option to delete it immediately afterwards - with _url.openurl(file_url, "rb") as f: - open(record_name, "wb").write(f.read()) - - wave_file = open(record_name, mode="rb") - record_name_out = ( - record_name.split(os.sep)[-1].replace("-", "_").replace(".wav", "") - ) - - chunk_ID = "".join( - [s.decode() for s in struct.unpack(">4s", wave_file.read(4))] - ) - if chunk_ID != "RIFF": - raise Exception("{} is not a .wav-format file".format(record_name)) - - correct_chunk_size = os.path.getsize(record_name) - 8 - chunk_size = struct.unpack("4s", wave_file.read(4))[0].decode() - if fmt != "WAVE": - raise Exception("{} is not a .wav-format file".format(record_name)) - - subchunk1_ID = struct.unpack(">4s", wave_file.read(4))[0].decode() - if subchunk1_ID != "fmt ": - raise Exception("Format chunk missing or corrupt") - - subchunk1_size = struct.unpack(" 1: - print("PCM has compression of {}".format(audio_format)) - - if (subchunk1_size != 16) or (audio_format != 1): - raise Exception("Unsupported format {}".format(audio_format)) - - num_channels = struct.unpack("4s", wave_file.read(4))[0].decode() - if subchunk2_ID != "data": - raise Exception("Format chunk missing or corrupt") - - correct_subchunk2_size = os.path.getsize(record_name) - 44 - subchunk2_size = struct.unpack(">> wfdb.wfdb2mat('100', pn_dir='pwave') - - The output file name is 100m.mat and 100m.hea - - """ - record = rdrecord( - record_name, pn_dir=pn_dir, sampfrom=sampfrom, sampto=sampto - ) - record_name_out = record_name.split(os.sep)[-1].replace("-", "_") + "m" - - # Some variables describing the format of the .mat file - field_version = 256 # 0x0100 or 256 - endian_indicator = b"IM" # little 
endian - master_type = 14 # matrix - sub1_type = 6 # UINT32 - sub2_type = 5 # INT32 - sub3_type = 1 # INT8 - sub1_class = 6 # double precision array - - # Determine if we can write 8-bit unsigned samples, or if 16 or 32 bits - # are needed per sample - bytes_per_element = 1 - for i in range(record.n_sig): - if record.adc_res[i] > 0: - if record.adc_res[i] > 16: - bytes_per_element = 4 - elif (record.adc_res[i] > 8) and (bytes_per_element < 2): - bytes_per_element = 2 - else: - # adc_res not specified.. try to guess from format - if (record.fmt[i] == "24") or (record.fmt[i] == "32"): - bytes_per_element = 4 - elif (record.fmt[i] != "80") and (bytes_per_element < 2): - bytes_per_element = 2 - - if bytes_per_element == 1: - sub4_type = 2 # MAT8 - out_type = " max_length: - raise Exception("Can't write .mat file: data size exceeds 2GB limit") - - # Bytes of actual data - bytes_of_data = bytes_per_element * record.n_sig * desired_length - # This is the remaining number of bytes that don't fit into integer - # multiple of 8: i.e. if 18 bytes, bytes_remain = 2, from 17 to 18 - bytes_remain = bytes_of_data % 8 - - # master_bytes = (8 + 8) + (8 + 8) + (8 + 8) + (8 + bytes_of_data) + padding - # Must be integer multiple 8 - if bytes_remain == 0: - master_bytes = bytes_of_data + 56 - else: - master_bytes = bytes_of_data + 64 - (bytes_remain) - - # Start writing the file - output_file = record_name_out + ".mat" - with open(output_file, "wb") as f: - # Descriptive text (124 bytes) - f.write(struct.pack("<124s", b"MATLAB 5.0")) - # Version (2 bytes) - f.write(struct.pack(">> wfdb.csv2mit('sample-data/100.csv', fs=360, units='mV') - - Create the header ('.hea') and record ('.dat') files, change units for - each signal - >>> wfdb.csv2mit('sample-data/100.csv', fs=360, units=['mV','kV']) - - Return just the record, note the use of lists to specify which values should - be applied to each signal - >>> csv_record = wfdb.csv2mit('sample-data/100.csv', fs=360, units=['mV','mV'], - fmt=['80',212'], adc_gain=[100,200], - baseline=[1024,512], record_only=True) - - Return just the record, note the use of single strings and ints to specify - when fields can be applied to all signals - >>> csv_record = wfdb.csv2mit('sample-data/100.csv', fs=360, units='mV', - fmt=['80','212'], adc_gain=200, baseline=1024, - record_only=True) - - """ - # NOTE: No need to write input checks here since the Record class should - # handle them (except verifying the CSV input format which is for Pandas) - if header: - df_CSV = pd.read_csv(file_name, delimiter=delimiter) - else: - df_CSV = pd.read_csv(file_name, delimiter=delimiter, header=None) - if verbose: - print("Successfully read CSV") - # Extract the entire signal from the dataframe - p_signal = df_CSV.values - # The dataframe should be in (`sig_len`, `n_sig`) dimensions - sig_len = p_signal.shape[0] - if verbose: - print("Signal length: {}".format(sig_len)) - n_sig = p_signal.shape[1] - if verbose: - print("Number of signals: {}".format(n_sig)) - # Check if signal names are valid and set defaults - if not sig_name: - if header: - sig_name = df_CSV.columns.to_list() - if any(map(str.isdigit, sig_name)): - print( - "WARNING: One or more of your signal names are numbers, this " - "is not recommended:\n- Does your CSV have a header line " - "which defines the signal names?\n- If not, please set the " - "parameter 'header' to False.\nSignal names: {}".format( - sig_name - ) - ) - else: - sig_name = ["ch_" + str(i) for i in range(n_sig)] - if verbose: - print("Signal names: 
{}".format(sig_name)) - - # Set the output header file name to be the same, remove path - if os.sep in file_name: - file_name = file_name.split(os.sep)[-1] - record_name = file_name.replace(".csv", "") - if verbose: - print("Output header: {}.hea".format(record_name)) - - # Replace the CSV file tag with DAT - dat_file_name = file_name.replace(".csv", ".dat") - dat_file_name = [dat_file_name] * n_sig - if verbose: - print("Output record: {}".format(dat_file_name[0])) - - # Convert `units` from string to list if necessary - units = [units] * n_sig if type(units) is str else units - - # Set the default `fmt` if none exists - if not fmt: - fmt = ["16"] * n_sig - fmt = [fmt] * n_sig if type(fmt) is str else fmt - if verbose: - print("Signal format: {}".format(fmt)) - - # Set the default `adc_gain` if none exists - if not adc_gain: - adc_gain = [200] * n_sig - adc_gain = [adc_gain] * n_sig if type(adc_gain) is int else adc_gain - if verbose: - print("Signal ADC gain: {}".format(adc_gain)) - - # Set the default `baseline` if none exists - if not baseline: - if adc_zero: - baseline = [adc_zero] * n_sig - else: - baseline = [0] * n_sig - baseline = [baseline] * n_sig if type(baseline) is int else baseline - if verbose: - print("Signal baseline: {}".format(baseline)) - - # Convert `samps_per_frame` from int to list if necessary - samps_per_frame = ( - [samps_per_frame] * n_sig - if type(samps_per_frame) is int - else samps_per_frame - ) - - # Convert `skew` from int to list if necessary - skew = [skew] * n_sig if type(skew) is int else skew - - # Convert `byte_offset` from int to list if necessary - byte_offset = ( - [byte_offset] * n_sig if type(byte_offset) is int else byte_offset - ) - - # Set the default `adc_res` if none exists - if not adc_res: - adc_res = [12] * n_sig - adc_res = [adc_res] * n_sig if type(adc_res) is int else adc_res - if verbose: - print("Signal ADC resolution: {}".format(adc_res)) - - # Set the default `adc_zero` if none exists - if not adc_zero: - adc_zero = [0] * n_sig - adc_zero = [adc_zero] * n_sig if type(adc_zero) is int else adc_zero - if verbose: - print("Signal ADC zero: {}".format(adc_zero)) - - # Set the default `init_value` - # NOTE: Initial value (and subsequently the digital signal) won't be correct - # unless the correct `baseline` and `adc_gain` are provided... 
this is just - # the best approximation - if not init_value: - init_value = p_signal[0, :] - init_value = baseline + (np.array(adc_gain) * init_value) - init_value = [int(i) for i in init_value.tolist()] - if verbose: - print("Signal initial value: {}".format(init_value)) - - # Set the default `checksum` - if not checksum: - checksum = [int(np.sum(v) % 65536) for v in np.transpose(p_signal)] - if verbose: - print("Signal checksum: {}".format(checksum)) - - # Set the default `block_size` - if not block_size: - block_size = [0] * n_sig - block_size = [block_size] * n_sig if type(block_size) is int else block_size - if verbose: - print("Signal block size: {}".format(block_size)) - - # Change the dates and times into `datetime` objects - if base_time: - base_time = _header.wfdb_strptime(base_time) - if base_date: - base_date = datetime.datetime.strptime(base_date, "%d/%m/%Y").date() - - # Convert array to floating point - p_signal = p_signal.astype("float64") - - # Either return the record or generate the record and header files - # if requested - if record_only: - # Create the record from the input and generated values - record = Record( - record_name=record_name, - n_sig=n_sig, - fs=fs, - samps_per_frame=samps_per_frame, - counter_freq=counter_freq, - base_counter=base_counter, - sig_len=sig_len, - base_time=base_time, - base_date=base_date, - comments=comments, - sig_name=sig_name, - p_signal=p_signal, - d_signal=None, - e_p_signal=None, - e_d_signal=None, - file_name=dat_file_name, - fmt=fmt, - skew=skew, - byte_offset=byte_offset, - adc_gain=adc_gain, - baseline=baseline, - units=units, - adc_res=adc_res, - adc_zero=adc_zero, - init_value=init_value, - checksum=checksum, - block_size=block_size, - ) - if verbose: - print("Record generated successfully") - return record - - else: - # Write the information to a record and header file - wrsamp( - record_name=record_name, - fs=fs, - units=units, - sig_name=sig_name, - p_signal=p_signal, - fmt=fmt, - adc_gain=adc_gain, - baseline=baseline, - comments=comments, - base_time=base_time, - base_date=base_date, - ) - if verbose: - print("File generated successfully") - - # ------------------------- Reading Records --------------------------- # @@ -3899,8 +1721,6 @@ def rdrecord( parameter is set, this parameter should contain just the base record name, and the files fill be searched for remotely. Otherwise, the data files will be searched for in the local path. - Can also handle .edf and .wav files as long as the extension is - provided in the `record_name`. sampfrom : int, optional The starting sample number to read for all channels. 
sampto : int, 'end', optional @@ -3987,12 +1807,7 @@ def rdrecord( dir_list[0], get_version(dir_list[0]), *dir_list[1:] ) - if record_name.endswith(".edf"): - record = edf2mit(record_name, pn_dir=pn_dir, record_only=True) - elif record_name.endswith(".wav"): - record = wav2mit(record_name, pn_dir=pn_dir, record_only=True) - else: - record = rdheader(record_name, pn_dir=pn_dir, rd_segments=False) + record = rdheader(record_name, pn_dir=pn_dir, rd_segments=False) # Set defaults for sampto and channels input variables if sampto is None: @@ -4073,12 +1888,8 @@ def rdrecord( # A single segment record elif isinstance(record, Record): - if record_name.endswith(".edf") or record_name.endswith(".wav"): - no_file = True - sig_data = record.d_signal - else: - no_file = False - sig_data = None + no_file = False + sig_data = None record.e_d_signal = _signal._rd_segment( file_name=record.file_name, @@ -4701,167 +2512,6 @@ def wfdbtime(record_name, input_times, pn_dir=None): print(f"{sample_num:>12}{out_time:>24}{out_date:>32}") -def sigavg( - record_name, - extension, - pn_dir=None, - return_df=False, - start_range=-0.05, - stop_range=0.05, - ann_type="all", - start_time=0, - stop_time=-1, - verbose=False, -): - """ - A common problem in signal processing is to determine the shape of a - recurring waveform in the presence of noise. If the waveform recurs - periodically (for example, once per second) the signal can be divided into - segments of an appropriate length (one second in this example), and the - segments can be averaged to reduce the amplitude of any noise that is - uncorrelated with the signal. Typically, noise is reduced by a factor of - the square root of the number of segments included in the average. For - physiologic signals, the waveforms of interest are usually not strictly - periodic, however. This function averages such waveforms by defining - segments (averaging windows) relative to the locations of waveform - annotations. By default, all QRS (beat) annotations for the specified - annotator are included. - - Parameters - ---------- - record_name : str - The name of the WFDB record to be read, without any file - extensions. If the argument contains any path delimiter - characters, the argument will be interpreted as PATH/BASE_RECORD. - Both relative and absolute paths are accepted. If the `pn_dir` - parameter is set, this parameter should contain just the base - record name, and the files fill be searched for remotely. - Otherwise, the data files will be searched for in the local path. - pn_dir : str, optional - Option used to stream data from Physionet. The Physionet - database directory from which to find the required record files. - eg. For record '100' in 'http://physionet.org/content/mitdb' - pn_dir='mitdb'. - return_df : bool, optional - Whether to return a Pandas dataframe (True) or just print the output - (False). - start_range : float, int, optional - Set the measurement window relative to QRS annotations. Negative - values correspond to offsets that precede the annotations. The default - is -0.05 seconds. - stop_range : float, int, optional - Set the measurement window relative to QRS annotations. Negative - values correspond to offsets that precede the annotations. The default - is 0.05 seconds. - ann_type : list[str], str, optional - Include annotations of the specified types only (i.e. 'N'). Multiple - types are also accepted (i.e. ['V','N']). The default is 'all' which - means to include all QRS annotations. 
- start_time : float, int, optional - Begin at the specified time in record. The default is 0 which denotes - the start of the record. - stop_time : float, int, optional - Process until the specified time in record. The default is -1 which - denotes the end of the record. - verbose : bool, optional - Whether to print the headers (True) or not (False). - - Returns - ------- - N/A : Pandas dataframe - If `return_df` is set to True, return a Pandas dataframe representing - the output from the original WFDB package. This is the same content as - if `return_df` were set to False, just in dataframe form. - - """ - if start_range >= stop_range: - raise Exception("`start_range` must be less than `stop_range`") - if start_time == stop_time: - raise Exception("`start_time` must be different than `stop_time`") - if (stop_time != -1) and (start_time >= stop_time): - raise Exception("`start_time` must be less than `stop_time`") - if start_time < 0: - raise Exception("`start_time` must be at least 0") - if (stop_time != -1) and (stop_time <= 0): - raise Exception("`stop_time` must be at least greater than 0") - - if (pn_dir is not None) and ("." not in pn_dir): - dir_list = pn_dir.split("/") - pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] - ) - - rec = rdrecord(record_name, pn_dir=pn_dir, physical=False) - ann = annotation.rdann(record_name, extension) - - if stop_time == -1: - stop_time = max(ann.sample) / ann.fs - samp_start = int(start_time * ann.fs) - samp_stop = int(stop_time * ann.fs) - filtered_samples = ann.sample[ - (ann.sample >= samp_start) & (ann.sample <= samp_stop) - ] - - times = np.arange( - int(start_range * rec.fs) / rec.fs, - int(-(-stop_range // (1 / rec.fs))) / rec.fs, - 1 / rec.fs, - ) - indices = np.rint(times * rec.fs).astype(np.int64) - - n_beats = 0 - initial_sig_avgs = np.zeros((times.shape[0], rec.n_sig)) - all_symbols = [a.symbol for a in annotation.ann_labels] - - for samp in filtered_samples: - samp_i = np.where(ann.sample == samp)[0][0] - current_ann = ann.symbol[samp_i] - if (ann_type != "all") and ( - ((type(ann_type) is str) and (current_ann != ann_type)) - or ((type(ann_type) is list) and (current_ann not in ann_type)) - ): - continue - try: - if not annotation.is_qrs[all_symbols.index(current_ann)]: - continue - except ValueError: - continue - - for c, i in enumerate(indices): - for j in range(rec.n_sig): - try: - initial_sig_avgs[c][j] += rec.d_signal[samp + i][j] - except IndexError: - initial_sig_avgs[c][j] += 0 - n_beats += 1 - - if n_beats < 1: - raise Exception("No beats found") - - if verbose and not return_df: - print(f"# Average of {n_beats} beats:") - s = "{:>14}" * rec.n_sig - print(f"# Time{s.format(*rec.sig_name)}") - print(f"# sec{s.format(*rec.units)}") - - final_sig_avgs = [] - for i, time in enumerate(times): - sig_avgs = [] - for j in range(rec.n_sig): - temp_sig_avg = initial_sig_avgs[i][j] / n_beats - temp_sig_avg -= rec.baseline[j] - temp_sig_avg /= rec.adc_gain[j] - sig_avgs.append(round(temp_sig_avg, 5)) - final_sig_avgs.append(sig_avgs) - - df = pd.DataFrame(final_sig_avgs, columns=rec.sig_name) - df.insert(0, "Time", np.around(times, decimals=5)) - if return_df: - return df - else: - print(df.to_string(index=False, header=False, col_space=13)) - - def _get_date_from_time(start_time, hours, minutes, seconds, out_time): """ Convert a starting time to a date using the elapsed time. 
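The relocated `sigavg` keeps the exact signature of the function removed above, so existing calls only need a new import path. A minimal usage sketch (the local record `sample-data/100` and its `atr` annotator are illustrative assumptions, mirroring the sample files referenced elsewhere in this series, and are not part of this patch):

    from wfdb.processing import sigavg

    # Average a window from 50 ms before to 50 ms after each normal ('N')
    # beat; averaging n beats suppresses uncorrelated noise by about sqrt(n).
    df = sigavg(
        "sample-data/100",
        "atr",
        ann_type="N",
        start_range=-0.05,
        stop_range=0.05,
        return_df=True,
    )
    print(df.head())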
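For the `csv_to_wfdb` defaults set earlier in this patch, the physical-to-digital bookkeeping reduces to digital = baseline + adc_gain * physical. A small self-contained sketch with made-up numbers (not taken from any sample file):

    import numpy as np

    # Two frames of a two-channel physical signal, as if parsed from a CSV.
    p_signal = np.array([[0.15, 0.20], [0.30, 0.05]])
    adc_gain = [200, 200]  # default gain when none is supplied
    baseline = [0, 0]      # default baseline when no adc_zero is supplied

    # init_value: first frame converted to digital units, matching
    # init_value = baseline + adc_gain * p_signal[0, :] above. This is only
    # an approximation unless the true gain and baseline are provided.
    init_value = [
        int(b + g * v) for b, g, v in zip(baseline, adc_gain, p_signal[0, :])
    ]

    # checksum: per-channel sum modulo 65536, taken (as in the code above)
    # over the physical samples.
    checksum = [int(np.sum(col) % 65536) for col in p_signal.T]

    print(init_value, checksum)  # -> [30, 40] [0, 0]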
@@ -4968,8 +2618,7 @@ def wrsamp(
     ----------
     record_name : str
         The string name of the WFDB record to be written (without any file
-        extensions). Must not contain any "." since this would indicate an
-        EDF file which is not compatible at this point.
+        extensions). Must not contain any ".".
     fs : int, float
         The sampling frequency of the record.
     units : list
@@ -5202,20 +2851,14 @@ def dl_database(
     for rec in record_list:
         print("Generating record list for: " + rec)
 
-        # Check out whether each record is in MIT or EDF format
-        if rec.endswith(".edf"):
-            all_files.append(rec)
+        # May be pointing to directory
+        if rec.endswith(os.sep):
+            nested_records += [
+                posixpath.join(rec, sr)
+                for sr in download.get_record_list(posixpath.join(db_dir, rec))
+            ]
         else:
-            # May be pointing to directory
-            if rec.endswith(os.sep):
-                nested_records += [
-                    posixpath.join(rec, sr)
-                    for sr in download.get_record_list(
-                        posixpath.join(db_dir, rec)
-                    )
-                ]
-            else:
-                nested_records.append(rec)
+            nested_records.append(rec)
 
     for rec in nested_records:
         print("Generating list of all files for: " + rec)
diff --git a/wfdb/processing/__init__.py b/wfdb/processing/__init__.py
index c5b77249..657f9311 100644
--- a/wfdb/processing/__init__.py
+++ b/wfdb/processing/__init__.py
@@ -14,3 +14,4 @@
 from wfdb.processing.hr import compute_hr, calc_rr, calc_mean_hr
 from wfdb.processing.peaks import find_peaks, find_local_peaks, correct_peaks
 from wfdb.processing.qrs import XQRS, xqrs_detect, gqrs_detect
+from wfdb.processing.filter import sigavg
diff --git a/wfdb/processing/filter.py b/wfdb/processing/filter.py
new file mode 100644
index 00000000..795addfb
--- /dev/null
+++ b/wfdb/processing/filter.py
@@ -0,0 +1,168 @@
+import posixpath
+
+import numpy as np
+import pandas as pd
+
+from wfdb.io import annotation
+from wfdb.io.record import get_version, rdrecord
+
+
+def sigavg(
+    record_name,
+    extension,
+    pn_dir=None,
+    return_df=False,
+    start_range=-0.05,
+    stop_range=0.05,
+    ann_type="all",
+    start_time=0,
+    stop_time=-1,
+    verbose=False,
+):
+    """
+    A common problem in signal processing is to determine the shape of a
+    recurring waveform in the presence of noise. If the waveform recurs
+    periodically (for example, once per second) the signal can be divided into
+    segments of an appropriate length (one second in this example), and the
+    segments can be averaged to reduce the amplitude of any noise that is
+    uncorrelated with the signal. Typically, noise is reduced by a factor of
+    the square root of the number of segments included in the average. For
+    physiologic signals, the waveforms of interest are usually not strictly
+    periodic, however. This function averages such waveforms by defining
+    segments (averaging windows) relative to the locations of waveform
+    annotations. By default, all QRS (beat) annotations for the specified
+    annotator are included.
+
+    Parameters
+    ----------
+    record_name : str
+        The name of the WFDB record to be read, without any file
+        extensions. If the argument contains any path delimiter
+        characters, the argument will be interpreted as PATH/BASE_RECORD.
+        Both relative and absolute paths are accepted. If the `pn_dir`
+        parameter is set, this parameter should contain just the base
+        record name, and the files will be searched for remotely.
+        Otherwise, the data files will be searched for in the local path.
+    extension : str
+        The annotator extension of the annotation file linked to
+        `record_name`, e.g. 'atr' for the file '100.atr'.
+    pn_dir : str, optional
+        Option used to stream data from Physionet. The Physionet
+        database directory from which to find the required record files.
+        eg.
For record '100' in 'http://physionet.org/content/mitdb' + pn_dir='mitdb'. + return_df : bool, optional + Whether to return a Pandas dataframe (True) or just print the output + (False). + start_range : float, int, optional + Set the measurement window relative to QRS annotations. Negative + values correspond to offsets that precede the annotations. The default + is -0.05 seconds. + stop_range : float, int, optional + Set the measurement window relative to QRS annotations. Negative + values correspond to offsets that precede the annotations. The default + is 0.05 seconds. + ann_type : list[str], str, optional + Include annotations of the specified types only (i.e. 'N'). Multiple + types are also accepted (i.e. ['V','N']). The default is 'all' which + means to include all QRS annotations. + start_time : float, int, optional + Begin at the specified time in record. The default is 0 which denotes + the start of the record. + stop_time : float, int, optional + Process until the specified time in record. The default is -1 which + denotes the end of the record. + verbose : bool, optional + Whether to print the headers (True) or not (False). + + Returns + ------- + N/A : Pandas dataframe + If `return_df` is set to True, return a Pandas dataframe representing + the output from the original WFDB package. This is the same content as + if `return_df` were set to False, just in dataframe form. + + """ + if start_range >= stop_range: + raise Exception("`start_range` must be less than `stop_range`") + if start_time == stop_time: + raise Exception("`start_time` must be different than `stop_time`") + if (stop_time != -1) and (start_time >= stop_time): + raise Exception("`start_time` must be less than `stop_time`") + if start_time < 0: + raise Exception("`start_time` must be at least 0") + if (stop_time != -1) and (stop_time <= 0): + raise Exception("`stop_time` must be at least greater than 0") + + if (pn_dir is not None) and ("." 
not in pn_dir): + dir_list = pn_dir.split("/") + pn_dir = posixpath.join( + dir_list[0], get_version(dir_list[0]), *dir_list[1:] + ) + + rec = rdrecord(record_name, pn_dir=pn_dir, physical=False) + ann = annotation.rdann(record_name, extension) + + if stop_time == -1: + stop_time = max(ann.sample) / ann.fs + samp_start = int(start_time * ann.fs) + samp_stop = int(stop_time * ann.fs) + filtered_samples = ann.sample[ + (ann.sample >= samp_start) & (ann.sample <= samp_stop) + ] + + times = np.arange( + int(start_range * rec.fs) / rec.fs, + int(-(-stop_range // (1 / rec.fs))) / rec.fs, + 1 / rec.fs, + ) + indices = np.rint(times * rec.fs).astype(np.int64) + + n_beats = 0 + initial_sig_avgs = np.zeros((times.shape[0], rec.n_sig)) + all_symbols = [a.symbol for a in annotation.ann_labels] + + for samp in filtered_samples: + samp_i = np.where(ann.sample == samp)[0][0] + current_ann = ann.symbol[samp_i] + if (ann_type != "all") and ( + ((type(ann_type) is str) and (current_ann != ann_type)) + or ((type(ann_type) is list) and (current_ann not in ann_type)) + ): + continue + try: + if not annotation.is_qrs[all_symbols.index(current_ann)]: + continue + except ValueError: + continue + + for c, i in enumerate(indices): + for j in range(rec.n_sig): + try: + initial_sig_avgs[c][j] += rec.d_signal[samp + i][j] + except IndexError: + initial_sig_avgs[c][j] += 0 + n_beats += 1 + + if n_beats < 1: + raise Exception("No beats found") + + if verbose and not return_df: + print(f"# Average of {n_beats} beats:") + s = "{:>14}" * rec.n_sig + print(f"# Time{s.format(*rec.sig_name)}") + print(f"# sec{s.format(*rec.units)}") + + final_sig_avgs = [] + for i, time in enumerate(times): + sig_avgs = [] + for j in range(rec.n_sig): + temp_sig_avg = initial_sig_avgs[i][j] / n_beats + temp_sig_avg -= rec.baseline[j] + temp_sig_avg /= rec.adc_gain[j] + sig_avgs.append(round(temp_sig_avg, 5)) + final_sig_avgs.append(sig_avgs) + + df = pd.DataFrame(final_sig_avgs, columns=rec.sig_name) + df.insert(0, "Time", np.around(times, decimals=5)) + if return_df: + return df + else: + print(df.to_string(index=False, header=False, col_space=13)) From 3a35ef69918e5cbdf5dd0c3ea74410576cc1e5f9 Mon Sep 17 00:00:00 2001 From: Chen Xie Date: Tue, 3 May 2022 22:10:10 -0700 Subject: [PATCH 2/3] Move more content and remove cyclic dependencies --- tests/io/test_convert.py | 8 +- wfdb/__init__.py | 6 +- wfdb/io/__init__.py | 12 +- wfdb/io/_header.py | 1 - wfdb/io/_signal.py | 1 - wfdb/io/annotation.py | 579 +----------------------------------- wfdb/io/convert/__init__.py | 5 + wfdb/io/convert/csv.py | 210 +++++++++++++ wfdb/io/convert/edf.py | 217 +++++++++++++- wfdb/io/convert/wav.py | 4 +- wfdb/io/download.py | 41 ++- wfdb/io/record.py | 46 +-- wfdb/processing/__init__.py | 2 +- wfdb/processing/filter.py | 5 +- wfdb/processing/hr.py | 156 ++++++++++ wfdb/processing/peaks.py | 1 - wfdb/processing/qrs.py | 1 - 17 files changed, 645 insertions(+), 650 deletions(-) diff --git a/tests/io/test_convert.py b/tests/io/test_convert.py index 52b21da5..aa7ba78a 100644 --- a/tests/io/test_convert.py +++ b/tests/io/test_convert.py @@ -1,7 +1,7 @@ import numpy as np -import wfdb -from wfdb.io import read_edf +from wfdb.io.record import rdrecord +from wfdb.io.convert.edf import read_edf class TestConvert: @@ -11,7 +11,7 @@ def test_edf_uniform(self): """ # Uniform sample rates - record_MIT = wfdb.rdrecord("sample-data/n16").__dict__ + record_MIT = rdrecord("sample-data/n16").__dict__ record_EDF = read_edf("sample-data/n16.edf").__dict__ fields = 
list(record_MIT.keys()) @@ -63,7 +63,7 @@ def test_edf_non_uniform(self): """ # Non-uniform sample rates - record_MIT = wfdb.rdrecord("sample-data/wave_4").__dict__ + record_MIT = rdrecord("sample-data/wave_4").__dict__ record_EDF = read_edf("sample-data/wave_4.edf").__dict__ fields = list(record_MIT.keys()) diff --git a/wfdb/__init__.py b/wfdb/__init__.py index aaa53499..89cdcbee 100644 --- a/wfdb/__init__.py +++ b/wfdb/__init__.py @@ -17,16 +17,12 @@ wrann, show_ann_labels, show_ann_classes, - ann2rr, - rr2ann, - csv2ann, - rdedfann, mrgann, ) from wfdb.io.download import ( + dl_files, get_dbs, get_record_list, - dl_files, set_db_index_url, ) from wfdb.plot.plot import plot_items, plot_wfdb, plot_all_records diff --git a/wfdb/io/__init__.py b/wfdb/io/__init__.py index d49665b5..6c2ac908 100644 --- a/wfdb/io/__init__.py +++ b/wfdb/io/__init__.py @@ -19,20 +19,12 @@ wrann, show_ann_labels, show_ann_classes, - ann2rr, - rr2ann, - csv2ann, - rdedfann, mrgann, ) from wfdb.io.download import ( + dl_files, get_dbs, get_record_list, - dl_files, set_db_index_url, ) -from wfdb.io.convert.csv import csv_to_wfdb -from wfdb.io.convert.edf import read_edf, wfdb_to_edf -from wfdb.io.convert.matlab import wfdb_to_mat -from wfdb.io.convert.tff import rdtff -from wfdb.io.convert.wav import wfdb_to_wav, read_wav + diff --git a/wfdb/io/_header.py b/wfdb/io/_header.py index 83c9c900..96f56f90 100644 --- a/wfdb/io/_header.py +++ b/wfdb/io/_header.py @@ -1,7 +1,6 @@ import datetime import os import re -import pdb import numpy as np import pandas as pd diff --git a/wfdb/io/_signal.py b/wfdb/io/_signal.py index 405a4031..e4f3a1b0 100644 --- a/wfdb/io/_signal.py +++ b/wfdb/io/_signal.py @@ -5,7 +5,6 @@ import numpy as np from wfdb.io import download -import pdb MAX_I32 = 2147483647 diff --git a/wfdb/io/annotation.py b/wfdb/io/annotation.py index 5be014c0..e1b206a1 100644 --- a/wfdb/io/annotation.py +++ b/wfdb/io/annotation.py @@ -4,13 +4,12 @@ import pandas as pd import re import posixpath -import struct import sys from wfdb.io import download from wfdb.io import _header from wfdb.io import record -from wfdb.io.convert.edf import read_edf + class Annotation(object): """ @@ -1928,7 +1927,7 @@ def rdann( if (pn_dir is not None) and ("." not in pn_dir): dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], record.get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) return_label_elements = check_read_inputs( @@ -2561,578 +2560,6 @@ def rm_last(*args): return -def ann2rr( - record_name, - extension, - pn_dir=None, - start_time=None, - stop_time=None, - format=None, - as_array=True, -): - from wfdb.processing import hr - - """ - Obtain RR interval series from ECG annotation files. - - Parameters - ---------- - record_name : str - The record name of the WFDB annotation file. ie. for file '100.atr', - record_name='100'. - extension : str - The annotatator extension of the annotation file. ie. for file - '100.atr', extension='atr'. - pn_dir : str, optional - Option used to stream data from Physionet. The PhysioNet database - directory from which to find the required annotation file. eg. For - record '100' in 'http://physionet.org/content/mitdb': pn_dir='mitdb'. - start_time : float, optional - The time to start the intervals in seconds. - stop_time : float, optional - The time to stop the intervals in seconds. - format : str, optional - Print intervals in the specified format. By default, intervals are - printed in units of sample intervals. 
Other formats include - 's' (seconds), 'm' (minutes), 'h' (hours). Set to 'None' for samples. - as_array : bool, optional - If True, return an an 'ndarray', else print the output. - - Returns - ------- - N/A - - Examples - -------- - >>> wfdb.ann2rr('sample-data/100', 'atr', as_array=False) - >>> 18 - >>> 59 - >>> ... - >>> 250 - >>> 257 - - """ - if (pn_dir is not None) and ("." not in pn_dir): - dir_list = pn_dir.split("/") - pn_dir = posixpath.join( - dir_list[0], record.get_version(dir_list[0]), *dir_list[1:] - ) - - ann = rdann(record_name, extension, pn_dir=pn_dir) - - rr_interval = hr.calc_rr(ann.sample, fs=ann.fs) - rr_interval = np.insert(rr_interval, 0, ann.sample[0]) - - time_interval = rr_interval / ann.fs - if start_time is not None: - time_interval = time_interval[(time_interval > start_time).astype(bool)] - if stop_time is not None: - time_interval = time_interval[(time_interval < stop_time).astype(bool)] - - # Already given in seconds (format == 's') - if format == "s": - out_interval = time_interval - elif format == "m": - out_interval = time_interval / 60 - elif format == "h": - out_interval = time_interval / (60 * 60) - else: - out_interval = np.around(time_interval * ann.fs).astype(np.int) - - if as_array: - return out_interval - else: - print(*out_interval, sep="\n") - - -def rr2ann(rr_array, record_name, extension, fs=250, as_time=False): - """ - Creates an annotation file from the standard input, which should usually - be a Numpy array of intervals in the format produced by `ann2rr`. (For - exceptions, see the `as_time` parameter below.). An optional second column - may be provided which gives the respective annotation mnemonic. - - Parameters - ---------- - rr_array : ndarray - A Numpy array consisting of the input RR intervals. If `as_time` is - set to True, then the input should consist of times of occurences. If, - the shape of the input array is '(n_annot,2)', then treat the second - column as the annotation mnemonic ('N', 'V', etc.). If a second column - is not specified, then the default annotation will the '"' which - specifies a comment. - record_name : str - The record name of the WFDB annotation file. ie. for file '100.atr', - record_name='100'. - extension : str - The annotatator extension of the annotation file. ie. for file - '100.atr', extension='atr'. - fs : float, int, optional - Assume the specified sampling frequency. This option has no effect - unless the `as_time` parameter is set to convert to samples; in this - case, a sampling frequency of 250 Hz is assumed if this option is - omitted. - as_time : bool - Interpret the input as times of occurrence (if True), rather than as - samples (if False). There is not currently a way to input RR intervals - in time format between beats. For example, 0.2 seconds between beats - 1->2, 0.3 seconds between beats 2->3, etc. 
- - Returns - ------- - N/A - - Examples - -------- - Using time of occurence as input: - >>> import numpy as np - >>> rr_array = np.array([[0.2, 0.6, 1.3], ['V', 'N', 'V']]).T - >>> wfdb.rr2ann(rr_array, 'test_ann', 'atr', fs=100, as_time=True) - - Using samples as input: - >>> import numpy as np - >>> rr_array = np.array([4, 17, 18, 16]) - >>> wfdb.rr2ann(rr_array, 'test_ann', 'atr') - - """ - try: - ann_sample = rr_array[:, 0] - except IndexError: - ann_sample = rr_array - - if as_time: - ann_sample = (fs * ann_sample.astype(np.float64)).astype(np.int64) - else: - ann_sample = np.cumsum(ann_sample).astype(np.int64) - - try: - ann_symbol = rr_array[:, 1].tolist() - except IndexError: - ann_symbol = rr_array.shape[0] * ['"'] - - wrann(record_name, extension, ann_sample, symbol=ann_symbol) - - -def csv2ann( - file_name, - extension="atr", - fs=None, - record_only=False, - time_onset=True, - header=True, - delimiter=",", - verbose=False, -): - """ - Read a CSV/TSV/etc. file and return either an `Annotation` object with the - annotation descriptors as attributes or write an annotation file. - - Parameters - ---------- - file_name : str - The name of the CSV file to be read, including the '.csv' file - extension. If the argument contains any path delimiter characters, the - argument will be interpreted as PATH/BASE_RECORD. Both relative and - absolute paths are accepted. The BASE_RECORD file name will be used to - name the annotation file with the desired extension. - extension : str, optional - The string annotation file extension. - fs : float, optional - This will be used if annotation onsets are given in the format of time - (`time_onset` = True) instead of sample since onsets must be sample - numbers in order for `wrann` to work. This number can be expressed in - any format legal for a Python input of floating point numbers (thus - '360', '360.', '360.0', and '3.6e2' are all legal and equivalent). The - sampling frequency must be greater than 0; if it is missing, a value - of 250 is assumed. - record_only : bool, optional - Whether to only return the record information (True) or not (False). - If false, this function will generate the annotation file. - time_onset : bool, optional - Whether to assume the values provided in the 'onset' column are in - units of time (True) or samples (False). If True, convert the onset - times to samples by using the, now required, `fs` input. - header : bool, optional - Whether to assume the CSV has a first line header (True) or not - (False) which defines the signal names. - delimiter : str, optional - What to use as the delimiter for the file to separate data. The default - if a comma (','). Other common delimiters are tabs ('\t'), spaces (' '), - pipes ('|'), and colons (':'). - verbose : bool, optional - Whether to print all the information read about the file (True) or - not (False). - - Returns - ------- - N/A : Annotation, optional - The WFDB Annotation object representing the contents of the CSV file - read. - - Notes - ----- - CSVs should be in one of the two possible following format: - - 1) All events are single time events (no duration). - - onset,description - onset_1,description_1 - onset_2,description_2 - ...,... - - Or this format if `header=False` is defined: - - onset_1,description_1 - onset_2,description_2 - ...,... - - 2) A duration is specified for some events. - - onset,duration,description - onset_1,duration_1,description_1 - onset_2,duration_2,description_2 - ...,...,... 
- - Or this format if `header=False` is defined: - - onset_1,duration_1,description_1 - onset_2,duration_2,description_2 - ...,...,... - - By default, the 'onset' will be interpreted as a sample number if it is - strictly in integer format and as a time otherwise. By default, the - 'duration' will be interpreted as time values and not elapsed samples. By - default, the 'description' will be interpreted as the `aux_note` for the - annotation and the `symbol` will automatically be set to " which defines a - comment. Future additions will allow the user to customize such - attributes. - - Examples - -------- - 1) Write WFDB annotation file from CSV with time onsets: - ======= start example.csv ======= - onset,description - 0.2,p-wave - 0.8,qrs - ======== end example.csv ======== - >>> wfdb.csv2ann('example.csv', fs=360) - * Creates a WFDB annotation file called: 'example.atr' - - 2) Write WFDB annotation file from CSV with sample onsets: - ======= start example.csv ======= - onset,description - 5,p-wave - 13,qrs - ======== end example.csv ======== - >>> wfdb.csv2ann('example.csv', fs=10, time_onset=False) - * Creates a WFDB annotation file called: 'example.atr' - * 5,13 samples -> 0.5,1.3 seconds for onset - - 3) Write WFDB annotation file from CSV with time onsets, durations, and no - header: - ======= start example.csv ======= - 0.2,0.1,qrs - 0.8,0.4,qrs - ======== end example.csv ======== - >>> wfdb.csv2ann('example.csv', extension='qrs', fs=360, header=False) - * Creates a WFDB annotation file called: 'example.qrs' - - """ - # NOTE: No need to write input checks here since the Annotation class - # should handle them (except verifying the CSV input format which is for - # Pandas) - if header: - df_CSV = pd.read_csv(file_name, delimiter=delimiter) - else: - df_CSV = pd.read_csv(file_name, delimiter=delimiter, header=None) - if verbose: - print("Successfully read CSV") - - if verbose: - print("Creating Pandas dataframe from CSV") - if df_CSV.shape[1] == 2: - if verbose: - print("onset,description format detected") - if not header: - df_CSV.columns = ["onset", "description"] - df_out = df_CSV - elif df_CSV.shape[1] == 3: - if verbose: - print("onset,duration,description format detected") - print("Converting durations to single time-point events") - if not header: - df_CSV.columns = ["onset", "duration", "description"] - df_out = _format_ann_from_df(df_CSV) - else: - raise Exception( - """The number of columns in the CSV was not - recognized.""" - ) - - # Remove extension from input file name - file_name = file_name.split(".")[0] - if time_onset: - if not fs: - raise Exception( - """`fs` must be provided if `time_onset` is True - since it is required to convert time onsets to - samples""" - ) - sample = (df_out["onset"].to_numpy() * fs).astype(np.int64) - else: - sample = df_out["onset"].to_numpy() - # Assume each annotation is a comment - symbol = ['"'] * len(df_out.index) - subtype = np.array([22] * len(df_out.index)) - # Assume each annotation belongs with the 1st channel - chan = np.array([0] * len(df_out.index)) - num = np.array([0] * len(df_out.index)) - aux_note = df_out["description"].tolist() - - if verbose: - print("Finished CSV parsing... 
writing to Annotation object") - - if record_only: - return Annotation( - record_name=file_name, - extension=extension, - sample=sample, - symbol=symbol, - subtype=subtype, - chan=chan, - num=num, - aux_note=aux_note, - fs=fs, - ) - if verbose: - print("Finished creating Annotation object") - else: - wrann( - file_name, - extension, - sample=sample, - symbol=symbol, - subtype=subtype, - chan=chan, - num=num, - aux_note=aux_note, - fs=fs, - ) - if verbose: - print("Finished writing Annotation file") - - -def rdedfann( - record_name, - pn_dir=None, - delete_file=True, - info_only=True, - record_only=False, - verbose=False, -): - """ - This program returns the annotation information from an EDF+ file - containing annotations (with the signal name given as 'EDF Annotations'). - The information that is returned if `info_only` is set to True is: - { - 'onset_time': list of %H:%M:%S.fff strings denoting the annotation - onset times, - 'sample_num': list of integers denoting the annotation onset - sample numbers, - 'comment': list of comments (`aux_note`) for the annotations, - 'duration': list of floats denoting the duration of the event - } - Else, this function will return either the WFDB Annotation format of the - information of the file if `record_only` is set to True, or nothing if - neither are set to True though a WFDB Annotation file will be created. - - Parameters - ---------- - record_name : str - The name of the input EDF record to be read. - pn_dir : str, optional - Option used to stream data from Physionet. The Physionet - database directory from which to find the required record files. - eg. For record '100' in 'http://physionet.org/content/mitdb' - pn_dir='mitdb'. - delete_file : bool, optional - Whether to delete the saved EDF file (False) or not (True) - after being imported. - info_only : bool, optional - Return, strictly, the information contained in the file as formatted - by the original WFDB package. Must not be True if `record_only` is - True. - record_only : bool, optional - Whether to only return the annotation information (True) or not - (False). If False, this function will generate a WFDB-formatted - annotation file. If True, it will return the object returned if that - file were read with `rdann`. Must not be True if `info_only` is True. - verbose : bool, optional - Whether to print all the information read about the file (True) or - not (False). - - Returns - ------- - N/A : dict, Annotation, optional - If 'info_only' is set to True, return all of the annotation - information needed to generate WFDB-formatted annotation files. - If 'record_only' is set to True, return the WFDB-formatted annotation - object generated by the `rdann` output. If none are set to True, write - the WFDB-formatted annotation file. 
- - Notes - ----- - The entire file is composed of (seen here: - https://www.edfplus.info/specs/edfplus.html#edfplusannotations): - - HEADER RECORD (we suggest to also adopt the 12 simple additional EDF+ specs) - 8 ascii : version of this data format (0) - 80 ascii : local patient identification (mind item 3 of the additional EDF+ specs) - 80 ascii : local recording identification (mind item 4 of the additional EDF+ specs) - 8 ascii : startdate of recording (dd.mm.yy) (mind item 2 of the additional EDF+ specs) - 8 ascii : starttime of recording (hh.mm.ss) - 8 ascii : number of bytes in header record - 44 ascii : reserved - 8 ascii : number of data records (-1 if unknown, obey item 10 of the additional EDF+ specs) - 8 ascii : duration of a data record, in seconds - 4 ascii : number of signals (ns) in data record - ns * 16 ascii : ns * label (must be 'EDF Annotations') - ns * 80 ascii : ns * transducer type (must be whitespace) - ns * 8 ascii : ns * physical dimension (must be whitespace) - ns * 8 ascii : ns * physical minimum (e.g. -500 or 34, different than physical maximum) - ns * 8 ascii : ns * physical maximum (e.g. 500 or 40, different than physical minimum) - ns * 8 ascii : ns * digital minimum (must be -32768) - ns * 8 ascii : ns * digital maximum (must be 32767) - ns * 80 ascii : ns * prefiltering (must be whitespace) - ns * 8 ascii : ns * nr of samples in each data record - ns * 32 ascii : ns * reserved - - ANNOTATION RECORD - - Examples - -------- - >>> ann_info = wfdb.rdedfann('sample-data/test_edfann.edf') - - """ - # Some preliminary checks - if info_only and record_only: - raise Exception( - "Both `info_only` and `record_only` are set. Only one " - "can be set at a time." - ) - - # According to the EDF+ docs: - # "The coding is EDF compatible in the sense that old EDF software would - # simply treat this 'EDF Annotations' signal as if it were a (strange- - # looking) ordinary signal" - rec = read_edf( - record_name, - pn_dir=pn_dir, - delete_file=delete_file, - record_only=True, - rdedfann_flag=True, - ) - - # Convert from array of integers to ASCII strings - annotation_string = "" - for chunk in rec.p_signal.flatten().astype(np.int64): - if chunk + 1 == 0: - continue - else: - adjusted_hex = hex( - struct.unpack("H", chunk + 1))[0] - ) - annotation_string += bytes.fromhex(adjusted_hex[2:]).decode("ascii") - # Remove all of the whitespace - for rep in ["\x00", "\x14", "\x15"]: - annotation_string = annotation_string.replace(rep, " ") - - # Parse the resulting annotation string - onsets = [] - onset_times = [] - sample_nums = [] - comments = [] - durations = [] - all_anns = annotation_string.split("+") - for ann in all_anns: - if ann == "": - continue - try: - ann_split = ann.strip().split(" ") - onset = float(ann_split[0]) - hours, rem = divmod(onset, 3600) - minutes, seconds = divmod(rem, 60) - onset_time = f"{hours:02.0f}:{minutes:02.0f}:{seconds:06.3f}" - sample_num = int(onset * rec.sig_len) - duration = float(ann_split[1]) - comment = " ".join(ann_split[2:]) - if verbose: - print( - f"{onset_time}\t{sample_num}\t{comment}\t\tduration: {duration}" - ) - onsets.append(onset) - onset_times.append(onset_time) - sample_nums.append(sample_num) - comments.append(comment) - durations.append(duration) - except IndexError: - continue - - if info_only: - return { - "onset_time": onset_times, - "sample_num": sample_nums, - "comment": comments, - "duration": durations, - } - else: - df_in = pd.DataFrame( - data={ - "onset": onsets, - "duration": durations, - "description": comments, - 
} - ) - df_out = _format_ann_from_df(df_in) - # Remove extension from input file name - record_name = record_name.split(os.sep)[-1].split(".")[0] - extension = "atr" - fs = rec.fs - sample = (df_out["onset"].to_numpy() * fs).astype(np.int64) - # Assume each annotation is a comment - symbol = ['"'] * len(df_out.index) - subtype = np.array([22] * len(df_out.index)) - # Assume each annotation belongs with the 1st channel - chan = np.array([0] * len(df_out.index)) - num = np.array([0] * len(df_out.index)) - aux_note = df_out["description"].tolist() - - if record_only: - return Annotation( - record_name=record_name, - extension=extension, - sample=sample, - symbol=symbol, - subtype=subtype, - chan=chan, - num=num, - aux_note=aux_note, - fs=fs, - ) - else: - wrann( - record_name, - extension, - sample=sample, - symbol=symbol, - subtype=subtype, - chan=chan, - num=num, - aux_note=aux_note, - fs=fs, - ) - - def mrgann( ann_file1, ann_file2, @@ -3471,7 +2898,7 @@ def mrgann( ) -def _format_ann_from_df(df_in): +def format_ann_from_df(df_in): """ Parameters ---------- diff --git a/wfdb/io/convert/__init__.py b/wfdb/io/convert/__init__.py index e69de29b..54cd440e 100644 --- a/wfdb/io/convert/__init__.py +++ b/wfdb/io/convert/__init__.py @@ -0,0 +1,5 @@ +from wfdb.io.convert.csv import csv2ann, csv_to_wfdb +from wfdb.io.convert.edf import rdedfann, read_edf, wfdb_to_edf +from wfdb.io.convert.matlab import wfdb_to_mat +from wfdb.io.convert.tff import rdtff +from wfdb.io.convert.wav import wfdb_to_wav, read_wav diff --git a/wfdb/io/convert/csv.py b/wfdb/io/convert/csv.py index 21ae29c7..cd8134d7 100644 --- a/wfdb/io/convert/csv.py +++ b/wfdb/io/convert/csv.py @@ -5,6 +5,7 @@ import pandas as pd from wfdb.io import _header +from wfdb.io.annotation import format_ann_from_df, Annotation, wrann from wfdb.io.record import Record, wrsamp @@ -478,3 +479,212 @@ def csv_to_wfdb( ) if verbose: print("File generated successfully") + + +def csv2ann( + file_name, + extension="atr", + fs=None, + record_only=False, + time_onset=True, + header=True, + delimiter=",", + verbose=False, +): + """ + Read a CSV/TSV/etc. file and return either an `Annotation` object with the + annotation descriptors as attributes or write an annotation file. + + Parameters + ---------- + file_name : str + The name of the CSV file to be read, including the '.csv' file + extension. If the argument contains any path delimiter characters, the + argument will be interpreted as PATH/BASE_RECORD. Both relative and + absolute paths are accepted. The BASE_RECORD file name will be used to + name the annotation file with the desired extension. + extension : str, optional + The string annotation file extension. + fs : float, optional + This will be used if annotation onsets are given in the format of time + (`time_onset` = True) instead of sample since onsets must be sample + numbers in order for `wrann` to work. This number can be expressed in + any format legal for a Python input of floating point numbers (thus + '360', '360.', '360.0', and '3.6e2' are all legal and equivalent). The + sampling frequency must be greater than 0; if it is missing, a value + of 250 is assumed. + record_only : bool, optional + Whether to only return the record information (True) or not (False). + If false, this function will generate the annotation file. + time_onset : bool, optional + Whether to assume the values provided in the 'onset' column are in + units of time (True) or samples (False). 
If True, convert the onset
+        times to samples using the now-required `fs` input.
+    header : bool, optional
+        Whether to assume the CSV has a first line header (True) or not
+        (False) which defines the signal names.
+    delimiter : str, optional
+        What to use as the delimiter for the file to separate data. The default
+        is a comma (','). Other common delimiters are tabs ('\t'), spaces (' '),
+        pipes ('|'), and colons (':').
+    verbose : bool, optional
+        Whether to print all the information read about the file (True) or
+        not (False).
+
+    Returns
+    -------
+    N/A : Annotation, optional
+        The WFDB Annotation object representing the contents of the CSV file
+        read.
+
+    Notes
+    -----
+    CSVs should be in one of the two following formats:
+
+    1) All events are single time events (no duration).
+
+    onset,description
+    onset_1,description_1
+    onset_2,description_2
+    ...,...
+
+    Or this format if `header=False` is defined:
+
+    onset_1,description_1
+    onset_2,description_2
+    ...,...
+
+    2) A duration is specified for some events.
+
+    onset,duration,description
+    onset_1,duration_1,description_1
+    onset_2,duration_2,description_2
+    ...,...,...
+
+    Or this format if `header=False` is defined:
+
+    onset_1,duration_1,description_1
+    onset_2,duration_2,description_2
+    ...,...,...
+
+    By default, the 'onset' will be interpreted as a sample number if it is
+    strictly in integer format and as a time otherwise. By default, the
+    'duration' will be interpreted as time values and not elapsed samples. By
+    default, the 'description' will be interpreted as the `aux_note` for the
+    annotation and the `symbol` will automatically be set to '"' which defines
+    a comment. Future additions will allow the user to customize such
+    attributes.
+
+    Examples
+    --------
+    1) Write WFDB annotation file from CSV with time onsets:
+    ======= start example.csv =======
+    onset,description
+    0.2,p-wave
+    0.8,qrs
+    ======== end example.csv ========
+    >>> wfdb.csv2ann('example.csv', fs=360)
+    * Creates a WFDB annotation file called: 'example.atr'
+
+    2) Write WFDB annotation file from CSV with sample onsets:
+    ======= start example.csv =======
+    onset,description
+    5,p-wave
+    13,qrs
+    ======== end example.csv ========
+    >>> wfdb.csv2ann('example.csv', fs=10, time_onset=False)
+    * Creates a WFDB annotation file called: 'example.atr'
+    * 5,13 samples -> 0.5,1.3 seconds for onset
+
+    3) Write WFDB annotation file from CSV with time onsets, durations, and no
+    header:
+    ======= start example.csv =======
+    0.2,0.1,qrs
+    0.8,0.4,qrs
+    ======== end example.csv ========
+    >>> wfdb.csv2ann('example.csv', extension='qrs', fs=360, header=False)
+    * Creates a WFDB annotation file called: 'example.qrs'
+
+    """
+    # NOTE: No need to write input checks here since the Annotation class
+    # should handle them (except verifying the CSV input format which is for
+    # Pandas)
+    if header:
+        df_CSV = pd.read_csv(file_name, delimiter=delimiter)
+    else:
+        df_CSV = pd.read_csv(file_name, delimiter=delimiter, header=None)
+    if verbose:
+        print("Successfully read CSV")
+
+    if verbose:
+        print("Creating Pandas dataframe from CSV")
+    if df_CSV.shape[1] == 2:
+        if verbose:
+            print("onset,description format detected")
+        if not header:
+            df_CSV.columns = ["onset", "description"]
+        df_out = df_CSV
+    elif df_CSV.shape[1] == 3:
+        if verbose:
+            print("onset,duration,description format detected")
+            print("Converting durations to single time-point events")
+        if not header:
+            df_CSV.columns = ["onset", "duration", "description"]
+        df_out = format_ann_from_df(df_CSV)
+ else: + raise Exception( + """The number of columns in the CSV was not + recognized.""" + ) + + # Remove extension from input file name + file_name = file_name.split(".")[0] + if time_onset: + if not fs: + raise Exception( + """`fs` must be provided if `time_onset` is True + since it is required to convert time onsets to + samples""" + ) + sample = (df_out["onset"].to_numpy() * fs).astype(np.int64) + else: + sample = df_out["onset"].to_numpy() + # Assume each annotation is a comment + symbol = ['"'] * len(df_out.index) + subtype = np.array([22] * len(df_out.index)) + # Assume each annotation belongs with the 1st channel + chan = np.array([0] * len(df_out.index)) + num = np.array([0] * len(df_out.index)) + aux_note = df_out["description"].tolist() + + if verbose: + print("Finished CSV parsing... writing to Annotation object") + + if record_only: + if verbose: + print("Finished creating Annotation object") + return Annotation( + record_name=file_name, + extension=extension, + sample=sample, + symbol=symbol, + subtype=subtype, + chan=chan, + num=num, + aux_note=aux_note, + fs=fs, + ) + else: + wrann( + file_name, + extension, + sample=sample, + symbol=symbol, + subtype=subtype, + chan=chan, + num=num, + aux_note=aux_note, + fs=fs, + ) + if verbose: + print("Finished writing Annotation file") diff --git a/wfdb/io/convert/edf.py b/wfdb/io/convert/edf.py index 17ae8b68..76c05697 100644 --- a/wfdb/io/convert/edf.py +++ b/wfdb/io/convert/edf.py @@ -6,8 +6,10 @@ import struct import numpy as np +import pandas as pd -from wfdb.io.record import Record, rdrecord, SIG_UNITS, get_version +from wfdb.io.annotation import Annotation, format_ann_from_df, wrann +from wfdb.io.record import Record, rdrecord, SIG_UNITS from wfdb.io import _url from wfdb.io import download @@ -124,7 +126,7 @@ def read_edf( if "." not in pn_dir: dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) file_url = posixpath.join(download.PN_INDEX_URL, pn_dir, record_name) @@ -986,3 +988,214 @@ def wfdb_to_edf( val, i ) ) + + +def rdedfann( + record_name, + pn_dir=None, + delete_file=True, + info_only=True, + record_only=False, + verbose=False, +): + """ + This program returns the annotation information from an EDF+ file + containing annotations (with the signal name given as 'EDF Annotations'). + The information that is returned if `info_only` is set to True is: + { + 'onset_time': list of %H:%M:%S.fff strings denoting the annotation + onset times, + 'sample_num': list of integers denoting the annotation onset + sample numbers, + 'comment': list of comments (`aux_note`) for the annotations, + 'duration': list of floats denoting the duration of the event + } + Else, this function will return either the WFDB Annotation format of the + information of the file if `record_only` is set to True, or nothing if + neither are set to True though a WFDB Annotation file will be created. + + Parameters + ---------- + record_name : str + The name of the input EDF record to be read. + pn_dir : str, optional + Option used to stream data from Physionet. The Physionet + database directory from which to find the required record files. + eg. For record '100' in 'http://physionet.org/content/mitdb' + pn_dir='mitdb'. + delete_file : bool, optional + Whether to delete the saved EDF file (False) or not (True) + after being imported. 
+ info_only : bool, optional + Return, strictly, the information contained in the file as formatted + by the original WFDB package. Must not be True if `record_only` is + True. + record_only : bool, optional + Whether to only return the annotation information (True) or not + (False). If False, this function will generate a WFDB-formatted + annotation file. If True, it will return the object returned if that + file were read with `rdann`. Must not be True if `info_only` is True. + verbose : bool, optional + Whether to print all the information read about the file (True) or + not (False). + + Returns + ------- + N/A : dict, Annotation, optional + If 'info_only' is set to True, return all of the annotation + information needed to generate WFDB-formatted annotation files. + If 'record_only' is set to True, return the WFDB-formatted annotation + object generated by the `rdann` output. If none are set to True, write + the WFDB-formatted annotation file. + + Notes + ----- + The entire file is composed of (seen here: + https://www.edfplus.info/specs/edfplus.html#edfplusannotations): + + HEADER RECORD (we suggest to also adopt the 12 simple additional EDF+ specs) + 8 ascii : version of this data format (0) + 80 ascii : local patient identification (mind item 3 of the additional EDF+ specs) + 80 ascii : local recording identification (mind item 4 of the additional EDF+ specs) + 8 ascii : startdate of recording (dd.mm.yy) (mind item 2 of the additional EDF+ specs) + 8 ascii : starttime of recording (hh.mm.ss) + 8 ascii : number of bytes in header record + 44 ascii : reserved + 8 ascii : number of data records (-1 if unknown, obey item 10 of the additional EDF+ specs) + 8 ascii : duration of a data record, in seconds + 4 ascii : number of signals (ns) in data record + ns * 16 ascii : ns * label (must be 'EDF Annotations') + ns * 80 ascii : ns * transducer type (must be whitespace) + ns * 8 ascii : ns * physical dimension (must be whitespace) + ns * 8 ascii : ns * physical minimum (e.g. -500 or 34, different than physical maximum) + ns * 8 ascii : ns * physical maximum (e.g. 500 or 40, different than physical minimum) + ns * 8 ascii : ns * digital minimum (must be -32768) + ns * 8 ascii : ns * digital maximum (must be 32767) + ns * 80 ascii : ns * prefiltering (must be whitespace) + ns * 8 ascii : ns * nr of samples in each data record + ns * 32 ascii : ns * reserved + + ANNOTATION RECORD + + Examples + -------- + >>> ann_info = wfdb.rdedfann('sample-data/test_edfann.edf') + + """ + # Some preliminary checks + if info_only and record_only: + raise Exception( + "Both `info_only` and `record_only` are set. Only one " + "can be set at a time." 
+    )
+
+    # According to the EDF+ docs:
+    # "The coding is EDF compatible in the sense that old EDF software would
+    # simply treat this 'EDF Annotations' signal as if it were a (strange-
+    # looking) ordinary signal"
+    rec = read_edf(
+        record_name,
+        pn_dir=pn_dir,
+        delete_file=delete_file,
+        record_only=True,
+        rdedfann_flag=True,
+    )
+
+    # Convert from array of integers to ASCII strings
+    annotation_string = ""
+    for chunk in rec.p_signal.flatten().astype(np.int64):
+        if chunk + 1 == 0:
+            continue
+        else:
+            adjusted_hex = hex(
+                struct.unpack("<H", struct.pack(">H", chunk + 1))[0]
+            )
+            annotation_string += bytes.fromhex(adjusted_hex[2:]).decode("ascii")
+    # Remove all of the whitespace
+    for rep in ["\x00", "\x14", "\x15"]:
+        annotation_string = annotation_string.replace(rep, " ")
+
+    # Parse the resulting annotation string
+    onsets = []
+    onset_times = []
+    sample_nums = []
+    comments = []
+    durations = []
+    all_anns = annotation_string.split("+")
+    for ann in all_anns:
+        if ann == "":
+            continue
+        try:
+            ann_split = ann.strip().split(" ")
+            onset = float(ann_split[0])
+            hours, rem = divmod(onset, 3600)
+            minutes, seconds = divmod(rem, 60)
+            onset_time = f"{hours:02.0f}:{minutes:02.0f}:{seconds:06.3f}"
+            sample_num = int(onset * rec.sig_len)
+            duration = float(ann_split[1])
+            comment = " ".join(ann_split[2:])
+            if verbose:
+                print(
+                    f"{onset_time}\t{sample_num}\t{comment}\t\tduration: {duration}"
+                )
+            onsets.append(onset)
+            onset_times.append(onset_time)
+            sample_nums.append(sample_num)
+            comments.append(comment)
+            durations.append(duration)
+        except IndexError:
+            continue
+
+    if info_only:
+        return {
+            "onset_time": onset_times,
+            "sample_num": sample_nums,
+            "comment": comments,
+            "duration": durations,
+        }
+    else:
+        df_in = pd.DataFrame(
+            data={
+                "onset": onsets,
+                "duration": durations,
+                "description": comments,
+            }
+        )
+        df_out = format_ann_from_df(df_in)
+        # Remove extension from input file name
+        record_name = record_name.split(os.sep)[-1].split(".")[0]
+        extension = "atr"
+        fs = rec.fs
+        sample = (df_out["onset"].to_numpy() * fs).astype(np.int64)
+        # Assume each annotation is a comment
+        symbol = ['"'] * len(df_out.index)
+        subtype = np.array([22] * len(df_out.index))
+        # Assume each annotation belongs with the 1st channel
+        chan = np.array([0] * len(df_out.index))
+        num = np.array([0] * len(df_out.index))
+        aux_note = df_out["description"].tolist()
+
+        if record_only:
+            return Annotation(
+                record_name=record_name,
+                extension=extension,
+                sample=sample,
+                symbol=symbol,
+                subtype=subtype,
+                chan=chan,
+                num=num,
+                aux_note=aux_note,
+                fs=fs,
+            )
+        else:
+            wrann(
+                record_name,
+                extension,
+                sample=sample,
+                symbol=symbol,
+                subtype=subtype,
+                chan=chan,
+                num=num,
+                aux_note=aux_note,
+                fs=fs,
+            )
diff --git a/wfdb/io/convert/wav.py b/wfdb/io/convert/wav.py
index eeecc3eb..22c45bc7 100644
--- a/wfdb/io/convert/wav.py
+++ b/wfdb/io/convert/wav.py
@@ -7,7 +7,7 @@
 from wfdb.io import Record
 from wfdb.io import download
 from wfdb.io import _url
-from wfdb.io.record import get_version, rdrecord
+from wfdb.io.record import rdrecord
 
 
 def wfdb_to_wav(
@@ -229,7 +229,7 @@ def read_wav(record_name, pn_dir=None, delete_file=True, record_only=False):
     if "."
not in pn_dir: dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) file_url = posixpath.join(download.PN_INDEX_URL, pn_dir, record_name) diff --git a/wfdb/io/download.py b/wfdb/io/download.py index c9677294..06953495 100644 --- a/wfdb/io/download.py +++ b/wfdb/io/download.py @@ -1,12 +1,11 @@ +import json import multiprocessing -import numpy as np -import re import os import posixpath -import pdb -import json -from wfdb.io import record, _url +import numpy as np + +from wfdb.io import _url # The PhysioNet index url @@ -241,6 +240,34 @@ def get_dbs(): # ---- Helper functions for downloading PhysioNet files ------- # +def get_version(pn_dir): + """ + Get the version number of the desired project. + + Parameters + ---------- + pn_dir : str + The PhysioNet database directory from which to find the + required version number. eg. For the project 'mitdb' in + 'http://physionet.org/content/mitdb', pn_dir='mitdb'. + + Returns + ------- + version_number : str + The version number of the most recent database. + + """ + db_dir = pn_dir.split("/")[0] + url = posixpath.join(PN_CONTENT_URL, db_dir) + "/" + with _url.openurl(url, "rb") as f: + content = f.read() + contents = [line.decode("utf-8").strip() for line in content.splitlines()] + version_number = [v for v in contents if "Version:" in v] + version_number = version_number[0].split(":")[-1].strip().split("<")[0] + + return version_number + + def get_record_list(db_dir, records="all"): """ Get a list of records belonging to a database. @@ -267,7 +294,7 @@ def get_record_list(db_dir, records="all"): # Full url PhysioNet database if "/" not in db_dir: db_url = posixpath.join( - config.db_index_url, db_dir, record.get_version(db_dir) + config.db_index_url, db_dir, get_version(db_dir) ) else: db_url = posixpath.join(config.db_index_url, db_dir) @@ -514,7 +541,7 @@ def dl_files(db, dl_dir, files, keep_subdirs=True, overwrite=False): """ # Full url PhysioNet database - db_dir = posixpath.join(db, record.get_version(db)) + db_dir = posixpath.join(db, get_version(db)) db_url = posixpath.join(PN_CONTENT_URL, db_dir) + "/" # Check if the database is valid diff --git a/wfdb/io/record.py b/wfdb/io/record.py index 35e80b35..90cad6ed 100644 --- a/wfdb/io/record.py +++ b/wfdb/io/record.py @@ -1444,34 +1444,6 @@ def multi_to_single(self, physical, return_res=64): ) -def get_version(pn_dir): - """ - Get the version number of the desired project. - - Parameters - ---------- - pn_dir : str - The PhysioNet database directory from which to find the - required version number. eg. For the project 'mitdb' in - 'http://physionet.org/content/mitdb', pn_dir='mitdb'. - - Returns - ------- - version_number : str - The version number of the most recent database. - - """ - db_dir = pn_dir.split("/")[0] - url = posixpath.join(download.PN_CONTENT_URL, db_dir) + "/" - with _url.openurl(url, "rb") as f: - content = f.read() - contents = [line.decode("utf-8").strip() for line in content.splitlines()] - version_number = [v for v in contents if "Version:" in v] - version_number = version_number[0].split(":")[-1].strip().split("<")[0] - - return version_number - - def _check_item_type( item, field_name, allowed_types, expect_list=False, required_channels="all" ): @@ -1622,7 +1594,7 @@ def rdheader(record_name, pn_dir=None, rd_segments=False): if (pn_dir is not None) and ("." 
not in pn_dir): dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) # Read the header file. Separate comment and non-comment lines @@ -1804,7 +1776,7 @@ def rdrecord( if (pn_dir is not None) and ("." not in pn_dir): dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) record = rdheader(record_name, pn_dir=pn_dir, rd_segments=False) @@ -2089,7 +2061,7 @@ def rdsamp( if (pn_dir is not None) and ("." not in pn_dir): dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) record = rdrecord( @@ -2159,7 +2131,7 @@ def sampfreq(record_name, pn_dir=None): if (pn_dir is not None) and ("." not in pn_dir): dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) record = rdheader(record_name, pn_dir=pn_dir) @@ -2212,7 +2184,7 @@ def signame(record_name, pn_dir=None, sig_nums=[]): if (pn_dir is not None) and ("." not in pn_dir): dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) record = rdheader(record_name, pn_dir=pn_dir) @@ -2292,7 +2264,7 @@ def wfdbdesc(record_name, pn_dir=None): if (pn_dir is not None) and ("." not in pn_dir): dir_list = pn_dir.split("/") pn_dir = posixpath.join( - dir_list[0], get_version(dir_list[0]), *dir_list[1:] + dir_list[0], download.get_version(dir_list[0]), *dir_list[1:] ) record = rdheader(record_name, pn_dir=pn_dir) @@ -2423,7 +2395,7 @@ def wfdbtime(record_name, input_times, pn_dir=None): if (pn_dir is not None) and ("." 
        dir_list = pn_dir.split("/")
         pn_dir = posixpath.join(
-            dir_list[0], get_version(dir_list[0]), *dir_list[1:]
+            dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
         )
 
     record = rdheader(record_name, pn_dir=pn_dir)
@@ -2832,10 +2804,10 @@ def dl_database(
     if "/" in db_dir:
         dir_list = db_dir.split("/")
         db_dir = posixpath.join(
-            dir_list[0], get_version(dir_list[0]), *dir_list[1:]
+            dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
         )
     else:
-        db_dir = posixpath.join(db_dir, get_version(db_dir))
+        db_dir = posixpath.join(db_dir, download.get_version(db_dir))
     db_url = posixpath.join(download.PN_CONTENT_URL, db_dir) + "/"
     # Check if the database is valid
     _url.openurl(db_url, check_access=True)
diff --git a/wfdb/processing/__init__.py b/wfdb/processing/__init__.py
index 657f9311..295aa89e 100644
--- a/wfdb/processing/__init__.py
+++ b/wfdb/processing/__init__.py
@@ -11,7 +11,7 @@
     compare_annotations,
     benchmark_mitdb,
 )
-from wfdb.processing.hr import compute_hr, calc_rr, calc_mean_hr
+from wfdb.processing.hr import compute_hr, calc_rr, calc_mean_hr, ann2rr, rr2ann
 from wfdb.processing.peaks import find_peaks, find_local_peaks, correct_peaks
 from wfdb.processing.qrs import XQRS, xqrs_detect, gqrs_detect
 from wfdb.processing.filter import sigavg
diff --git a/wfdb/processing/filter.py b/wfdb/processing/filter.py
index 795addfb..f8739b73 100644
--- a/wfdb/processing/filter.py
+++ b/wfdb/processing/filter.py
@@ -4,7 +4,8 @@
 import pandas as pd
 
 from wfdb.io import annotation
-from wfdb.io.record import get_version, rdrecord
+from wfdb.io import download
+from wfdb.io.record import rdrecord
 
 
 def sigavg(
@@ -94,7 +95,7 @@ def sigavg(
     if (pn_dir is not None) and ("." not in pn_dir):
         dir_list = pn_dir.split("/")
         pn_dir = posixpath.join(
-            dir_list[0], get_version(dir_list[0]), *dir_list[1:]
+            dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
         )
 
     rec = rdrecord(record_name, pn_dir=pn_dir, physical=False)
diff --git a/wfdb/processing/hr.py b/wfdb/processing/hr.py
index 1156668c..437da732 100644
--- a/wfdb/processing/hr.py
+++ b/wfdb/processing/hr.py
@@ -1,5 +1,10 @@
+import posixpath
+
 import numpy as np
 
+from wfdb.io.annotation import rdann, wrann
+from wfdb.io import download
+
 
 def compute_hr(sig_len, qrs_inds, fs):
     """
@@ -146,3 +151,154 @@ def calc_mean_hr(rr, fs=None, min_rr=None, max_rr=None, rr_units="samples"):
         mean_hr = mean_hr * fs
 
     return mean_hr
+
+
+def ann2rr(
+    record_name,
+    extension,
+    pn_dir=None,
+    start_time=None,
+    stop_time=None,
+    format=None,
+    as_array=True,
+):
+    """
+    Obtain RR interval series from ECG annotation files.
+
+    Parameters
+    ----------
+    record_name : str
+        The record name of the WFDB annotation file. ie. for file '100.atr',
+        record_name='100'.
+    extension : str
+        The annotator extension of the annotation file. ie. for file
+        '100.atr', extension='atr'.
+    pn_dir : str, optional
+        Option used to stream data from Physionet. The PhysioNet database
+        directory from which to find the required annotation file. eg. For
+        record '100' in 'http://physionet.org/content/mitdb': pn_dir='mitdb'.
+    start_time : float, optional
+        The time to start the intervals in seconds.
+    stop_time : float, optional
+        The time to stop the intervals in seconds.
+    format : str, optional
+        Print intervals in the specified format. By default, intervals are
+        printed in units of sample intervals. Other formats include
+        's' (seconds), 'm' (minutes), 'h' (hours). Set to 'None' for samples.
+    as_array : bool, optional
+        If True, return an 'ndarray', else print the output.
+
+    Returns
+    -------
+    out_interval : ndarray, optional
+        The RR intervals, returned only if `as_array` is True; otherwise
+        the intervals are printed and nothing is returned.
+
+    Examples
+    --------
+    >>> wfdb.ann2rr('sample-data/100', 'atr', as_array=False)
+    18
+    59
+    ...
+    250
+    257
+
+    """
+    if (pn_dir is not None) and ("." not in pn_dir):
+        dir_list = pn_dir.split("/")
+        pn_dir = posixpath.join(
+            dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
+        )
+
+    ann = rdann(record_name, extension, pn_dir=pn_dir)
+
+    rr_interval = calc_rr(ann.sample, fs=ann.fs)
+    rr_interval = np.insert(rr_interval, 0, ann.sample[0])
+
+    time_interval = rr_interval / ann.fs
+    if start_time is not None:
+        time_interval = time_interval[(time_interval > start_time).astype(bool)]
+    if stop_time is not None:
+        time_interval = time_interval[(time_interval < stop_time).astype(bool)]
+
+    # Already given in seconds (format == 's')
+    if format == "s":
+        out_interval = time_interval
+    elif format == "m":
+        out_interval = time_interval / 60
+    elif format == "h":
+        out_interval = time_interval / (60 * 60)
+    else:
+        out_interval = np.around(time_interval * ann.fs).astype(np.int64)
+
+    if as_array:
+        return out_interval
+    else:
+        print(*out_interval, sep="\n")
+
+
+def rr2ann(rr_array, record_name, extension, fs=250, as_time=False):
+    """
+    Create an annotation file from a NumPy array of intervals, typically in
+    the format produced by `ann2rr`. (For exceptions, see the `as_time`
+    parameter below.) An optional second column may be provided which gives
+    the respective annotation mnemonic.
+
+    Parameters
+    ----------
+    rr_array : ndarray
+        A NumPy array consisting of the input RR intervals. If `as_time` is
+        set to True, then the input should consist of times of occurrence. If
+        the shape of the input array is '(n_annot, 2)', the second column is
+        treated as the annotation mnemonic ('N', 'V', etc.). If a second
+        column is not specified, then the default annotation will be '"',
+        which specifies a comment.
+    record_name : str
+        The record name of the WFDB annotation file. ie. for file '100.atr',
+        record_name='100'.
+    extension : str
+        The annotator extension of the annotation file. ie. for file
+        '100.atr', extension='atr'.
+    fs : float, int, optional
+        Assume the specified sampling frequency. This option has no effect
+        unless the `as_time` parameter is set to convert to samples; in this
+        case, a sampling frequency of 250 Hz is assumed if this option is
+        omitted.
+    as_time : bool, optional
+        Interpret the input as times of occurrence (if True), rather than as
+        samples (if False). There is currently no way to input RR intervals
+        as times between beats, e.g. 0.2 seconds between beats 1->2,
+        0.3 seconds between beats 2->3, and so on.
+
+    Returns
+    -------
+    N/A
+
+    Examples
+    --------
+    Using time of occurrence as input:
+    >>> import numpy as np
+    >>> rr_array = np.array([[0.2, 0.6, 1.3], ['V', 'N', 'V']]).T
+    >>> wfdb.rr2ann(rr_array, 'test_ann', 'atr', fs=100, as_time=True)
+
+    Using samples as input:
+    >>> import numpy as np
+    >>> rr_array = np.array([4, 17, 18, 16])
+    >>> wfdb.rr2ann(rr_array, 'test_ann', 'atr')
+
+    """
+    try:
+        ann_sample = rr_array[:, 0]
+    except IndexError:
+        ann_sample = rr_array
+
+    if as_time:
+        ann_sample = (fs * ann_sample.astype(np.float64)).astype(np.int64)
+    else:
+        ann_sample = np.cumsum(ann_sample).astype(np.int64)
+
+    try:
+        ann_symbol = rr_array[:, 1].tolist()
+    except IndexError:
+        ann_symbol = rr_array.shape[0] * ['"']
+
+    wrann(record_name, extension, ann_sample, symbol=ann_symbol)
diff --git a/wfdb/processing/peaks.py b/wfdb/processing/peaks.py
index 8a9fc032..dd276950 100644
--- a/wfdb/processing/peaks.py
+++ b/wfdb/processing/peaks.py
@@ -1,4 +1,3 @@
-import copy
 import numpy as np
 
 from wfdb.processing.basic import smooth
diff --git a/wfdb/processing/qrs.py b/wfdb/processing/qrs.py
index 70e95fc7..ed0c606d 100644
--- a/wfdb/processing/qrs.py
+++ b/wfdb/processing/qrs.py
@@ -1,5 +1,4 @@
 import copy
-import pdb
 
 import numpy as np
 from scipy import signal
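For a quick sanity check of the relocated RR-interval helpers, the new functions
can be exercised as follows. This is an illustrative sketch, not part of the
patch or its test suite; it assumes a local checkout of the repository's
sample-data directory and writes a throwaway annotation file to the working
directory:

    import numpy as np

    import wfdb
    from wfdb.processing import ann2rr, rr2ann

    # RR intervals in samples, from the record 100 reference annotations.
    rr = ann2rr("sample-data/100", "atr")

    # Write them back out; with no second column, symbols default to '"'.
    rr2ann(rr, "rr_roundtrip", "atr")

    # rr2ann cumulatively sums the intervals, so the round trip should
    # reproduce the original annotation sample locations.
    ann = wfdb.rdann("rr_roundtrip", "atr")
    assert np.array_equal(ann.sample, np.cumsum(rr))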
From 29b03790961bd97c5fd16808c75757703227e65a Mon Sep 17 00:00:00 2001
From: Chen Xie
Date: Tue, 3 May 2022 22:37:31 -0700
Subject: [PATCH 3/3] Update documentation

---
 docs/convert.rst    | 35 +++++++++++++++++++++++++++++++++++
 docs/index.rst      |  1 +
 docs/io.rst         |  4 ++--
 docs/processing.rst |  2 +-
 docs/wfdb.rst       |  2 +-
 5 files changed, 40 insertions(+), 4 deletions(-)
 create mode 100644 docs/convert.rst

diff --git a/docs/convert.rst b/docs/convert.rst
new file mode 100644
index 00000000..1fa8f570
--- /dev/null
+++ b/docs/convert.rst
@@ -0,0 +1,35 @@
+Format Conversions
+==================
+
+The wfdb.io.convert subpackage contains functions to read and write
+non-WFDB files commonly used for waveforms.
+
+CSV
+---
+
+.. automodule:: wfdb.io.convert.csv
+    :members: csv_to_wfdb
+
+EDF
+---
+
+.. automodule:: wfdb.io.convert.edf
+    :members:
+
+Matlab
+------
+
+.. automodule:: wfdb.io.convert.matlab
+    :members:
+
+TFF
+---
+
+.. automodule:: wfdb.io.convert.tff
+    :members:
+
+WAV
+---
+
+.. automodule:: wfdb.io.convert.wav
+    :members:
diff --git a/docs/index.rst b/docs/index.rst
index e6726a99..e167ad4a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -53,6 +53,7 @@ Other Content
 
    installation
    wfdb-specifications
+   convert
 
 
 Indices and tables
diff --git a/docs/io.rst b/docs/io.rst
index dacfdea4..01981c16 100644
--- a/docs/io.rst
+++ b/docs/io.rst
@@ -9,7 +9,7 @@ WFDB Records
 ---------------
 
 .. automodule:: wfdb.io
-    :members: rdrecord, rdheader, rdsamp, wrsamp, read_edf, wfdb_to_edf, read_wav, wfdb_to_wav, csv_to_wfdb, wfdb_to_mat
+    :members: rdrecord, rdheader, rdsamp, wrsamp
 
 .. autoclass:: wfdb.io.Record
     :members: wrsamp, adc, dac
@@ -32,4 +32,4 @@ Downloading
 -----------
 
 .. automodule:: wfdb.io
-    :members: get_dbs, get_record_list, dl_database, dl_files, set_db_index_url
+    :members: dl_files, dl_database, get_dbs, get_record_list, set_db_index_url
diff --git a/docs/processing.rst b/docs/processing.rst
index 61e8013f..96c1a19a 100644
--- a/docs/processing.rst
+++ b/docs/processing.rst
@@ -17,7 +17,7 @@ Heart Rate
 ----------
 
 .. automodule:: wfdb.processing
-    :members: compute_hr
+    :members: compute_hr, calc_rr, calc_mean_hr, ann2rr, rr2ann
 
 
 Peaks
diff --git a/docs/wfdb.rst b/docs/wfdb.rst
index 7580694f..3d3fd67b 100644
--- a/docs/wfdb.rst
+++ b/docs/wfdb.rst
@@ -32,7 +32,7 @@ Downloading
 -----------
 
 .. automodule:: wfdb
-    :members: get_dbs, get_record_list, dl_database, dl_files, set_db_index_url
+    :members: dl_files, dl_database, get_dbs, get_record_list, set_db_index_url
 
 
 Plotting
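With the documentation patch applied, the conversion helpers are documented
under wfdb.io.convert rather than under the old record-module names (edf2mit,
mit2edf, and friends). A minimal usage sketch of the relocated EDF functions,
assuming the repository's sample-data files; the default output naming is
assumed to carry over from the pre-move mit2edf helper:

    from wfdb.io.convert.edf import read_edf, wfdb_to_edf

    # EDF -> WFDB: build a Record object from an EDF file.
    record = read_edf("sample-data/n16.edf")
    print(record.sig_name, record.fs)

    # WFDB -> EDF: write the record back out as an EDF file, named after
    # the record by default.
    wfdb_to_edf("sample-data/n16")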