Skip to content

Commit b22ce2e

Browse files
committed
Add data.write method, bump version
* Data.write to save CSV of reformatted input * Fix multiple deprecation warnings, mainly pandas related (ver 2) * Fix bug with vp, vpd, t_dew calcs not persisting on initial data access
1 parent 62bb918 commit b22ce2e

File tree

5 files changed

+116
-27
lines changed

5 files changed

+116
-27
lines changed

CHANGELOG.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
Change Log
22
==========
33

4+
Version 0.2.2
5+
-------------
6+
7+
Added method to write input data to a CSV file following the same standardized formatting and unit conversions that are implemented in ``qaqc.write``. This method is ``data.write``. This was done so that a user can easily rewrite the initially read data at its native time frequency that is often half-hourly or hourly as produced by eddy covariance processing software such as EddyPro. This is useful for creating input for sub-daily time series analyses that may be done in conjunction with ``flux-data-qaqc``.
8+
9+
Bug fixes related to internal automatic calculations for vapor pressure, vapor pressure deficit, saturation vapor pressure, and dew point temperature, where the calculated data were not assigned and did not persist until the second access of ``data.df``.
10+
11+
Fix multiple deprecation warnings caused by ``Pandas`` version 2, tested with version 2.2.2.
12+
413
Version 0.2.1
514
-------------
615

fluxdataqaqc/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
__name__ = 'fluxdataqaqc'
66
__author__ = 'John Volk'
7-
__version__ = '0.2.1'
7+
__version__ = '0.2.2'
88

99

1010
from fluxdataqaqc.data import Data

fluxdataqaqc/data.py

Lines changed: 79 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -232,11 +232,11 @@ def hourly_ASCE_refET(self, reference='short', anemometer_height=None):
232232
print(
233233
'Resampling ASCE reference ET input variables to hourly means'
234234
)
235-
tmean = df.t_avg.resample('H').mean()
235+
tmean = df.t_avg.resample('h').mean()
236236
length = len(tmean)
237-
rs = df.sw_in.resample('H').mean()
238-
ea = df.vp.resample('H').mean()
239-
uz = df.ws.resample('H').mean()
237+
rs = df.sw_in.resample('h').mean()
238+
ea = df.vp.resample('h').mean()
239+
uz = df.ws.resample('h').mean()
240240
zw = anemometer_height
241241
lat = np.full(length, self.latitude)
242242
lon = np.full(length, self.longitude)
@@ -326,6 +326,9 @@ def _calc_rn(self, df):
326326
df['Rn'] = df.sw_in + df.lw_in - df.sw_out - df.lw_out
327327
self.variables['Rn'] = 'Rn'
328328
self.units['Rn'] = 'w/m2'
329+
self.inv_map = {
330+
v: k for k, v in self.variables.items() if not k == v
331+
}
329332

330333
self._df = df
331334

@@ -367,7 +370,8 @@ def _calc_vpd_or_vp(self, df):
367370

368371
# calculate vpd from actual vapor pressure and temp
369372
# check if needed variables exist and units are correct
370-
has_vpd_vars = set(['vp','t_avg']).issubset(df.columns)
373+
has_vpd_vars = set(['vp','t_avg']).issubset(df.columns) and not\
374+
set(['vpd','es']).issubset(df.columns)
371375
units_correct = (
372376
self.units.get('vp') == 'kpa' and self.units.get('t_avg') == 'c'
373377
)
@@ -386,7 +390,9 @@ def _calc_vpd_or_vp(self, df):
386390
self.units['es'] = 'kpa'
387391

388392
# same calc actual vapor pressure from vapor pressure deficit and temp
389-
has_vp_vars = set(['vpd','t_avg']).issubset(df.columns)
393+
has_vp_vars = set(['vpd','t_avg']).issubset(df.columns) and not\
394+
set(['vp','es']).issubset(df.columns)
395+
390396
units_correct = (
391397
self.units.get('vpd') == 'kpa' and self.units.get('t_avg') == 'c'
392398
)
@@ -416,14 +422,20 @@ def _calc_vpd_or_vp(self, df):
416422
self.variables['rh'] = 'rh'
417423
self.units['rh'] = '%'
418424

419-
if 'vp' in self.variables and self.units.get('vp') == 'kpa':
425+
if 'vp' in self.variables and self.units.get('vp') == 'kpa' and not\
426+
't_dew' in df.columns:
420427
print(
421428
'Calculating dew point temperature from vapor pressure'
422429
)
423430
df['t_dew'] = (-1 / ((np.log(df.vp/.611) / 5423) - (1/273)))-273.15
424431
self.variables['t_dew'] = 't_dew'
425432
self.units['t_dew'] = 'c'
426433

434+
435+
self.inv_map = {
436+
v: k for k, v in self.variables.items() if not k == v
437+
}
438+
427439
self._df = df
428440

429441

@@ -549,6 +561,62 @@ def plot(self, ncols=1, output_type='save', out_file=None, suptitle='',
549561
if ret:
550562
return ret
551563

564+
def write(self, out_dir=None, use_input_names=False):
565+
"""
566+
Save time series of initially read in data after performing default
567+
naming formatting and unit conversions, save as CSV file. File name
568+
will be in the format "[site_ID]_input_data.csv".
569+
570+
The default location for saving output time series files is within an
571+
"output" subdirectory of the parent directory containing the
572+
config.ini file.
573+
574+
Keyword Arguments:
575+
out_dir (str or :obj:`None`): default :obj:`None`. Directory to
576+
save CSVs, if :obj:`None` save to :attr:`out_dir` instance
577+
variable (typically "output" directory where config.ini file
578+
exists).
579+
use_input_names (bool): default :obj:`False`. If :obj:`False` use
580+
``flux-data-qaqc`` variable names as in output file header,
581+
or if :obj:`True` use the user's input variable names where
582+
possible (for variables that were read in and not modified or
583+
calculated by ``flux-data-qaqc``).
584+
585+
Returns:
586+
:obj:`None`
587+
588+
Example:
589+
590+
Starting from a config.ini file,
591+
592+
>>> from fluxdataqaqc import Data, QaQc
593+
>>> d = Data('path/to/config.ini')
594+
>>> d.write()
595+
596+
"""
597+
598+
if out_dir is None:
599+
out_dir = self.out_dir
600+
else:
601+
out_dir = Path(out_dir)
602+
self.out_dir = out_dir.absolute()
603+
604+
if not out_dir.is_dir():
605+
print(
606+
'{} does not exist, creating directory'.format(
607+
out_dir.absolute()
608+
)
609+
)
610+
out_dir.mkdir(parents=True, exist_ok=True)
611+
612+
input_outf = out_dir / '{}_input_data.csv'.format(self.site_id)
613+
#self.df.head(); # creates vp/vpd if df has not been called yet.
614+
615+
if use_input_names:
616+
self.df.to_csv(input_outf)
617+
else:
618+
self.df.rename(columns=self.inv_map).to_csv(input_outf)
619+
552620
def _load_config(self, config_file):
553621
if not config_file.is_file():
554622
raise FileNotFoundError('ERROR: config file not found')
@@ -946,7 +1014,7 @@ def apply_qc_flags(self, threshold=None, flag=None,
9461014

9471015
@property
9481016
def df(self):
949-
"""
1017+
r"""
9501018
Pull variables out of the config and climate time series files load
9511019
them into a datetime-indexed :obj:`pandas.DataFrame`.
9521020
@@ -1310,10 +1378,10 @@ def calc_weight_avg(d, pref, df):
13101378
self._df = df # vpd calc uses attribute
13111379
# calc vapor pressure or vapor pressure deficit if hourly or less
13121380
# also converts units if needed for vp, vpd, t_avg
1313-
self._calc_vpd_or_vp(df)
1314-
self._calc_rn(df)
1381+
self._calc_vpd_or_vp(self._df)
1382+
self._calc_rn(self._df)
13151383

1316-
return df
1384+
return self._df.rename(columns=self.variables)
13171385

13181386
@df.setter
13191387
def df(self, data_frame):

fluxdataqaqc/qaqc.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@ def _check_daily_freq(self, drop_gaps, daily_frac, max_interp_hours,
696696
].copy()
697697
grped_night.drop_duplicates(inplace=True)
698698
grped_night = grped_night.groupby(
699-
pd.Grouper(freq='24H', offset='12H'),
699+
pd.Grouper(freq='24h', offset='12h'),
700700
group_keys=True).apply(
701701
lambda x: x.interpolate(
702702
method='linear', limit=max_night_gap,
@@ -706,7 +706,7 @@ def _check_daily_freq(self, drop_gaps, daily_frac, max_interp_hours,
706706
grped_day = tmp.loc[(tmp.Rn >= 0) | (tmp.Rn.isna())].copy()
707707
grped_day.drop_duplicates(inplace=True)
708708
grped_day = grped_day.groupby(
709-
pd.Grouper(freq='24H'),
709+
pd.Grouper(freq='24h'),
710710
group_keys=True).apply(
711711
lambda x: x.interpolate(
712712
method='linear', limit=max_gap,
@@ -717,7 +717,7 @@ def _check_daily_freq(self, drop_gaps, daily_frac, max_interp_hours,
717717
grped_night = tmp.copy()
718718
grped_night.drop_duplicates(inplace=True)
719719
grped_night = grped_night.groupby(
720-
pd.Grouper(freq='24H', offset='12H'),
720+
pd.Grouper(freq='24h', offset='12h'),
721721
group_keys=True).apply(
722722
lambda x: x.interpolate(
723723
method='linear', limit=max_night_gap,
@@ -727,7 +727,7 @@ def _check_daily_freq(self, drop_gaps, daily_frac, max_interp_hours,
727727
grped_day = tmp.copy()
728728
grped_day.drop_duplicates(inplace=True)
729729
grped_day = grped_day.groupby(
730-
pd.Grouper(freq='24H'),
730+
pd.Grouper(freq='24h'),
731731
group_keys=True).apply(
732732
lambda x: x.interpolate(
733733
method='linear', limit=max_gap,
@@ -1271,9 +1271,13 @@ def _ET_gap_fill(self, et_name='ET_corr', refET='ETr'):
12711271
Q1 = df['ETrF_filtered'].quantile(0.25)
12721272
Q3 = df['ETrF_filtered'].quantile(0.75)
12731273
IQR = Q3 - Q1
1274-
to_filter = df.query(
1275-
'ETrF_filtered<(@Q1-1.5*@IQR) or ETrF_filtered>(@Q3+1.5*@IQR)'
1276-
)
1274+
mask = (
1275+
df['ETrF_filtered'] < (Q1 - 1.5 * IQR)
1276+
) | \
1277+
(
1278+
df['ETrF_filtered'] > (Q3 + 1.5 * IQR)
1279+
)
1280+
to_filter = df[mask]
12771281
df.loc[to_filter.index, 'ETrF_filtered'] = np.nan
12781282
df['ETrF_filtered'] = df.ETrF_filtered.rolling(
12791283
7, min_periods=2, center=True
@@ -1294,9 +1298,13 @@ def _ET_gap_fill(self, et_name='ET_corr', refET='ETr'):
12941298
Q1 = df['EToF_filtered'].quantile(0.25)
12951299
Q3 = df['EToF_filtered'].quantile(0.75)
12961300
IQR = Q3 - Q1
1297-
to_filter = df.query(
1298-
'EToF_filtered<(@Q1-1.5*@IQR) or EToF_filtered>(@Q3+1.5*@IQR)'
1299-
)
1301+
mask = (
1302+
df['EToF_filtered'] < (Q1 - 1.5 * IQR)
1303+
) | \
1304+
(
1305+
df['EToF_filtered'] > (Q3 + 1.5 * IQR)
1306+
)
1307+
to_filter = df[mask]
13001308
df.loc[to_filter.index, 'EToF_filtered'] = np.nan
13011309
df['EToF_filtered'] = df.EToF_filtered.rolling(
13021310
7, min_periods=2, center=True
@@ -1652,13 +1660,15 @@ def _ebr_correction(self):
16521660
# make copy of original data for later
16531661
orig_df = df[['LE','H','Rn','G']].astype(float).copy()
16541662
orig_df['ebr'] = (orig_df.H + orig_df.LE) / (orig_df.Rn - orig_df.G)
1663+
16551664
# compute IQR to filter out extreme ebrs,
16561665
df['ebr'] = (df.H + df.LE) / (df.Rn - df.G)
16571666
Q1 = df['ebr'].quantile(0.25)
16581667
Q3 = df['ebr'].quantile(0.75)
16591668
IQR = Q3 - Q1
16601669
# filter values between Q1-1.5IQR and Q3+1.5IQR
1661-
filtered = df.query('(@Q1 - 1.5 * @IQR) <= ebr <= (@Q3 + 1.5 * @IQR)')
1670+
mask = (df['ebr'] >= (Q1 - 1.5 * IQR)) & (df['ebr'] <= (Q3 + 1.5 * IQR))
1671+
filtered = df[mask]
16621672
# apply filter
16631673
filtered_mask = filtered.index
16641674
removed_mask = set(df.index) - set(filtered_mask)
@@ -1717,12 +1727,13 @@ def _ebr_correction(self):
17171727
df['DOY'] = df.index.dayofyear
17181728
# datetime indices of all remaining null elements
17191729
null_dates = df.loc[df.ebr_corr.isnull(), 'ebr_corr'].index
1730+
17201731
merged = pd.merge(
17211732
df, ebr_5day_clim, left_on='DOY', right_index=True
17221733
)
17231734
# assign 5 day climatology of EBR
17241735
merged.loc[null_dates,'ebr_corr'] =\
1725-
merged.loc[null_dates,'ebr_5day_clim']
1736+
merged.loc[null_dates,'ebr_5day_clim'].astype(float)
17261737
# replace raw variables with unfiltered dataframe copy
17271738
merged.LE = orig_df.LE
17281739
merged.H = orig_df.H
@@ -1767,7 +1778,8 @@ def _ebr_correction(self):
17671778
cols = list(set(merged.columns).difference(df.columns))
17681779
# join calculated data in
17691780
merged = df.join(merged[cols], how='outer')
1770-
merged.drop('DOY', axis=1, inplace=True)
1781+
# remove merge columns with suffix
1782+
merged = merged.drop(columns=['DOY_x','DOY_y'])
17711783

17721784
self.variables.update(
17731785
energy = 'energy',

fluxdataqaqc/util.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,11 +181,11 @@ def monthly_resample(df, cols, agg_str, thresh=0.75):
181181
with the months daily mean before summation.
182182
"""
183183
if agg_str == 'sum':
184-
mdf = df.loc[:,cols].apply(pd.to_numeric).resample('M').agg(
184+
mdf = df.loc[:,cols].apply(pd.to_numeric).resample('ME').agg(
185185
[agg_str, 'count', 'mean']
186186
)
187187
else:
188-
mdf = df.loc[:,cols].apply(pd.to_numeric).resample('M').agg(
188+
mdf = df.loc[:,cols].apply(pd.to_numeric).resample('ME').agg(
189189
[agg_str, 'count']
190190
)
191191

0 commit comments

Comments
 (0)