Skip to content

Commit b22ce2e

Browse files
committed
Add data.write method, bump version
* Data.write to save CSV of reformatted input * Fix multiple deprecation warnings, mainly pandas related (ver 2) * Fix bug with vp, vpd, t_dew calcs not persisting on initial data access
1 parent 62bb918 commit b22ce2e

File tree

5 files changed

+116
-27
lines changed

5 files changed

+116
-27
lines changed

CHANGELOG.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
Change Log
22
==========
33

4+
Version 0.2.2
5+
-------------
6+
7+
Added method to write input data to a CSV file following the same standardized formatting and unit conversions that are implemented in ``qaqc.write``. This method is ``data.write``. This was done so that a user can easily rewrite the initially read data at its native time frequency that is often half-hourly or hourly as produced by eddy covariance processing software such as EddyPro. This is useful for creating input for sub-daily time series analyses that may be done in conjunction with ``flux-data-qaqc``.
8+
9+
Bug fixes related to internal automatic calculations for vapor pressure, vapor pressure deficit, saturation vapor pressure, and dew point temperature, where the calculated data were not assigned and did not persist until the second access of ``data.df``.
10+
11+
Fix multiple deprecation warnings caused by ``Pandas`` version 2, tested with version 2.2.2.
12+
413
Version 0.2.1
514
-------------
615

fluxdataqaqc/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
__name__ = 'fluxdataqaqc'
66
__author__ = 'John Volk'
7-
__version__ = '0.2.1'
7+
__version__ = '0.2.2'
88

99

1010
from fluxdataqaqc.data import Data

fluxdataqaqc/data.py

Lines changed: 79 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -232,11 +232,11 @@ def hourly_ASCE_refET(self, reference='short', anemometer_height=None):
232232
print(
233233
'Resampling ASCE reference ET input variables to hourly means'
234234
)
235-
tmean = df.t_avg.resample('H').mean()
235+
tmean = df.t_avg.resample('h').mean()
236236
length = len(tmean)
237-
rs = df.sw_in.resample('H').mean()
238-
ea = df.vp.resample('H').mean()
239-
uz = df.ws.resample('H').mean()
237+
rs = df.sw_in.resample('h').mean()
238+
ea = df.vp.resample('h').mean()
239+
uz = df.ws.resample('h').mean()
240240
zw = anemometer_height
241241
lat = np.full(length, self.latitude)
242242
lon = np.full(length, self.longitude)
@@ -326,6 +326,9 @@ def _calc_rn(self, df):
326326
df['Rn'] = df.sw_in + df.lw_in - df.sw_out - df.lw_out
327327
self.variables['Rn'] = 'Rn'
328328
self.units['Rn'] = 'w/m2'
329+
self.inv_map = {
330+
v: k for k, v in self.variables.items() if not k == v
331+
}
329332

330333
self._df = df
331334

@@ -367,7 +370,8 @@ def _calc_vpd_or_vp(self, df):
367370

368371
# calculate vpd from actual vapor pressure and temp
369372
# check if needed variables exist and units are correct
370-
has_vpd_vars = set(['vp','t_avg']).issubset(df.columns)
373+
has_vpd_vars = set(['vp','t_avg']).issubset(df.columns) and not\
374+
set(['vpd','es']).issubset(df.columns)
371375
units_correct = (
372376
self.units.get('vp') == 'kpa' and self.units.get('t_avg') == 'c'
373377
)
@@ -386,7 +390,9 @@ def _calc_vpd_or_vp(self, df):
386390
self.units['es'] = 'kpa'
387391

388392
# same calc actual vapor pressure from vapor pressure deficit and temp
389-
has_vp_vars = set(['vpd','t_avg']).issubset(df.columns)
393+
has_vp_vars = set(['vpd','t_avg']).issubset(df.columns) and not\
394+
set(['vp','es']).issubset(df.columns)
395+
390396
units_correct = (
391397
self.units.get('vpd') == 'kpa' and self.units.get('t_avg') == 'c'
392398
)
@@ -416,14 +422,20 @@ def _calc_vpd_or_vp(self, df):
416422
self.variables['rh'] = 'rh'
417423
self.units['rh'] = '%'
418424

419-
if 'vp' in self.variables and self.units.get('vp') == 'kpa':
425+
if 'vp' in self.variables and self.units.get('vp') == 'kpa' and not\
426+
't_dew' in df.columns:
420427
print(
421428
'Calculating dew point temperature from vapor pressure'
422429
)
423430
df['t_dew'] = (-1 / ((np.log(df.vp/.611) / 5423) - (1/273)))-273.15
424431
self.variables['t_dew'] = 't_dew'
425432
self.units['t_dew'] = 'c'
426433

434+
435+
self.inv_map = {
436+
v: k for k, v in self.variables.items() if not k == v
437+
}
438+
427439
self._df = df
428440

429441

@@ -549,6 +561,62 @@ def plot(self, ncols=1, output_type='save', out_file=None, suptitle='',
549561
if ret:
550562
return ret
551563

564+
def write(self, out_dir=None, use_input_names=False):
565+
"""
566+
Save time series of initially read in data after performing default
567+
naming formatting and unit conversions, save as CSV file. File name
568+
will be in the format "[site_ID]_input_data.csv".
569+
570+
The default location for saving output time series files is within an
571+
"output" subdirectory of the parent directory containing the
572+
config.ini file.
573+
574+
Keyword Arguments:
575+
out_dir (str or :obj:`None`): default :obj:`None`. Directory to
576+
save CSVs, if :obj:`None` save to :attr:`out_dir` instance
577+
variable (typically "output" directory where config.ini file
578+
exists).
579+
use_input_names (bool): default :obj:`False`. If :obj:`False` use
580+
``flux-data-qaqc`` variable names as in output file header,
581+
or if :obj:`True` use the user's input variable names where
582+
possible (for variables that were read in and not modified or
583+
calculated by ``flux-data-qaqc``).
584+
585+
Returns:
586+
:obj:`None`
587+
588+
Example:
589+
590+
Starting from a config.ini file,
591+
592+
>>> from fluxdataqaqc import Data, QaQc
593+
>>> d = Data('path/to/config.ini')
594+
>>> d.write()
595+
596+
"""
597+
598+
if out_dir is None:
599+
out_dir = self.out_dir
600+
else:
601+
out_dir = Path(out_dir)
602+
self.out_dir = out_dir.absolute()
603+
604+
if not out_dir.is_dir():
605+
print(
606+
'{} does not exist, creating directory'.format(
607+
out_dir.absolute()
608+
)
609+
)
610+
out_dir.mkdir(parents=True, exist_ok=True)
611+
612+
input_outf = out_dir / '{}_input_data.csv'.format(self.site_id)
613+
#self.df.head(); # creates vp/vpd if df has not been called yet.
614+
615+
if use_input_names:
616+
self.df.to_csv(input_outf)
617+
else:
618+
self.df.rename(columns=self.inv_map).to_csv(input_outf)
619+
552620
def _load_config(self, config_file):
553621
if not config_file.is_file():
554622
raise FileNotFoundError('ERROR: config file not found')
@@ -946,7 +1014,7 @@ def apply_qc_flags(self, threshold=None, flag=None,
9461014

9471015
@property
9481016
def df(self):
949-
"""
1017+
r"""
9501018
Pull variables out of the config and climate time series files load
9511019
them into a datetime-indexed :obj:`pandas.DataFrame`.
9521020
@@ -1310,10 +1378,10 @@ def calc_weight_avg(d, pref, df):
13101378
self._df = df # vpd calc uses attribute
13111379
# calc vapor pressure or vapor pressure deficit if hourly or less
13121380
# also converts units if needed for vp, vpd, t_avg
1313-
self._calc_vpd_or_vp(df)
1314-
self._calc_rn(df)
1381+
self._calc_vpd_or_vp(self._df)
1382+
self._calc_rn(self._df)
13151383

1316-
return df
1384+
return self._df.rename(columns=self.variables)
13171385

13181386
@df.setter
13191387
def df(self, data_frame):

fluxdataqaqc/qaqc.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@ def _check_daily_freq(self, drop_gaps, daily_frac, max_interp_hours,
696696
].copy()
697697
grped_night.drop_duplicates(inplace=True)
698698
grped_night = grped_night.groupby(
699-
pd.Grouper(freq='24H', offset='12H'),
699+
pd.Grouper(freq='24h', offset='12h'),
700700
group_keys=True).apply(
701701
lambda x: x.interpolate(
702702
method='linear', limit=max_night_gap,
@@ -706,7 +706,7 @@ def _check_daily_freq(self, drop_gaps, daily_frac, max_interp_hours,
706706
grped_day = tmp.loc[(tmp.Rn >= 0) | (tmp.Rn.isna())].copy()
707707
grped_day.drop_duplicates(inplace=True)
708708
grped_day = grped_day.groupby(
709-
pd.Grouper(freq='24H'),
709+
pd.Grouper(freq='24h'),
710710
group_keys=True).apply(
711711
lambda x: x.interpolate(
712712
method='linear', limit=max_gap,
@@ -717,7 +717,7 @@ def _check_daily_freq(self, drop_gaps, daily_frac, max_interp_hours,
717717
grped_night = tmp.copy()
718718
grped_night.drop_duplicates(inplace=True)
719719
grped_night = grped_night.groupby(
720-
pd.Grouper(freq='24H', offset='12H'),
720+
pd.Grouper(freq='24h', offset='12h'),
721721
group_keys=True).apply(
722722
lambda x: x.interpolate(
723723
method='linear', limit=max_night_gap,
@@ -727,7 +727,7 @@ def _check_daily_freq(self, drop_gaps, daily_frac, max_interp_hours,
727727
grped_day = tmp.copy()
728728
grped_day.drop_duplicates(inplace=True)
729729
grped_day = grped_day.groupby(
730-
pd.Grouper(freq='24H'),
730+
pd.Grouper(freq='24h'),
731731
group_keys=True).apply(
732732
lambda x: x.interpolate(
733733
method='linear', limit=max_gap,
@@ -1271,9 +1271,13 @@ def _ET_gap_fill(self, et_name='ET_corr', refET='ETr'):
12711271
Q1 = df['ETrF_filtered'].quantile(0.25)
12721272
Q3 = df['ETrF_filtered'].quantile(0.75)
12731273
IQR = Q3 - Q1
1274-
to_filter = df.query(
1275-
'ETrF_filtered<(@Q1-1.5*@IQR) or ETrF_filtered>(@Q3+1.5*@IQR)'
1276-
)
1274+
mask = (
1275+
df['ETrF_filtered'] < (Q1 - 1.5 * IQR)
1276+
) | \
1277+
(
1278+
df['ETrF_filtered'] > (Q3 + 1.5 * IQR)
1279+
)
1280+
to_filter = df[mask]
12771281
df.loc[to_filter.index, 'ETrF_filtered'] = np.nan
12781282
df['ETrF_filtered'] = df.ETrF_filtered.rolling(
12791283
7, min_periods=2, center=True
@@ -1294,9 +1298,13 @@ def _ET_gap_fill(self, et_name='ET_corr', refET='ETr'):
12941298
Q1 = df['EToF_filtered'].quantile(0.25)
12951299
Q3 = df['EToF_filtered'].quantile(0.75)
12961300
IQR = Q3 - Q1
1297-
to_filter = df.query(
1298-
'EToF_filtered<(@Q1-1.5*@IQR) or EToF_filtered>(@Q3+1.5*@IQR)'
1299-
)
1301+
mask = (
1302+
df['EToF_filtered'] < (Q1 - 1.5 * IQR)
1303+
) | \
1304+
(
1305+
df['EToF_filtered'] > (Q3 + 1.5 * IQR)
1306+
)
1307+
to_filter = df[mask]
13001308
df.loc[to_filter.index, 'EToF_filtered'] = np.nan
13011309
df['EToF_filtered'] = df.EToF_filtered.rolling(
13021310
7, min_periods=2, center=True
@@ -1652,13 +1660,15 @@ def _ebr_correction(self):
16521660
# make copy of original data for later
16531661
orig_df = df[['LE','H','Rn','G']].astype(float).copy()
16541662
orig_df['ebr'] = (orig_df.H + orig_df.LE) / (orig_df.Rn - orig_df.G)
1663+
16551664
# compute IQR to filter out extreme ebrs,
16561665
df['ebr'] = (df.H + df.LE) / (df.Rn - df.G)
16571666
Q1 = df['ebr'].quantile(0.25)
16581667
Q3 = df['ebr'].quantile(0.75)
16591668
IQR = Q3 - Q1
16601669
# filter values between Q1-1.5IQR and Q3+1.5IQR
1661-
filtered = df.query('(@Q1 - 1.5 * @IQR) <= ebr <= (@Q3 + 1.5 * @IQR)')
1670+
mask = (df['ebr'] >= (Q1 - 1.5 * IQR)) & (df['ebr'] <= (Q3 + 1.5 * IQR))
1671+
filtered = df[mask]
16621672
# apply filter
16631673
filtered_mask = filtered.index
16641674
removed_mask = set(df.index) - set(filtered_mask)
@@ -1717,12 +1727,13 @@ def _ebr_correction(self):
17171727
df['DOY'] = df.index.dayofyear
17181728
# datetime indices of all remaining null elements
17191729
null_dates = df.loc[df.ebr_corr.isnull(), 'ebr_corr'].index
1730+
17201731
merged = pd.merge(
17211732
df, ebr_5day_clim, left_on='DOY', right_index=True
17221733
)
17231734
# assign 5 day climatology of EBR
17241735
merged.loc[null_dates,'ebr_corr'] =\
1725-
merged.loc[null_dates,'ebr_5day_clim']
1736+
merged.loc[null_dates,'ebr_5day_clim'].astype(float)
17261737
# replace raw variables with unfiltered dataframe copy
17271738
merged.LE = orig_df.LE
17281739
merged.H = orig_df.H
@@ -1767,7 +1778,8 @@ def _ebr_correction(self):
17671778
cols = list(set(merged.columns).difference(df.columns))
17681779
# join calculated data in
17691780
merged = df.join(merged[cols], how='outer')
1770-
merged.drop('DOY', axis=1, inplace=True)
1781+
# remove merge columns with suffix
1782+
merged = merged.drop(columns=['DOY_x','DOY_y'])
17711783

17721784
self.variables.update(
17731785
energy = 'energy',

fluxdataqaqc/util.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,11 +181,11 @@ def monthly_resample(df, cols, agg_str, thresh=0.75):
181181
with the months daily mean before summation.
182182
"""
183183
if agg_str == 'sum':
184-
mdf = df.loc[:,cols].apply(pd.to_numeric).resample('M').agg(
184+
mdf = df.loc[:,cols].apply(pd.to_numeric).resample('ME').agg(
185185
[agg_str, 'count', 'mean']
186186
)
187187
else:
188-
mdf = df.loc[:,cols].apply(pd.to_numeric).resample('M').agg(
188+
mdf = df.loc[:,cols].apply(pd.to_numeric).resample('ME').agg(
189189
[agg_str, 'count']
190190
)
191191

0 commit comments

Comments
 (0)