From c972afc2e55d4e09abd33ef0f8388256f31f17e8 Mon Sep 17 00:00:00 2001 From: tinsmcl1 Date: Thu, 3 Oct 2024 12:21:30 -0400 Subject: [PATCH 1/8] Add docstrings --- src/hapiutils.py | 84 +++++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/src/hapiutils.py b/src/hapiutils.py index 67880aa..49b7f12 100644 --- a/src/hapiutils.py +++ b/src/hapiutils.py @@ -7,13 +7,14 @@ def nparray_unpack_to_list(arr) -> list: + """Converts a np.ndarray to a list.""" if type(arr) == np.ndarray: return arr.tolist() else: return arr -def merge_dtypes(dataA, dataB, trim="Time"): +def merge_dtypes(dataA, dataB, trim: str = "Time"): """could not use stackoverflow comprehensives of forms a[0],str(a[1]) because it fails on 2D items like ('Field_Vector', ' pd.DataFrame: + """Convert hapi data array to a pandas DataFrame while preserving data types. + + Args: + data (_type_): Hapi data array. + round_to_sec (bool, optional): Round time to nearest second. Defaults to False. + clean_time (bool, optional): _description_. Defaults to False. + + Returns: + pd.DataFrame: Hapi data in a pandas DataFrame. 
+ """ # automatically 'cleans' hapitimes as well # if round_to_sec: @@ -36,79 +49,90 @@ def hapi_to_df(dataA, round_to_sec=False, clean_time=False): has_multiD = False multiD = {} - namelist = list(dataA.dtype.names) - for name in dataA.dtype.names: + namelist = list(data.dtype.names) + for name in data.dtype.names: try: - if dataA[name].shape[1]: + if data[name].shape[1]: has_multiD = True multiD[name] = True except: multiD[name] = False if has_multiD: - dfA = pd.DataFrame({"Time": dataA["Time"]}) # ,dtype='string') + df = pd.DataFrame({"Time": data["Time"]}) # ,dtype='string') namelist.remove("Time") for name in namelist: if multiD[name]: # dfA[name] = pd.Series(dtype='object') - dfA[name] = list(dataA[name]) # list or tuple work + df[name] = list(data[name]) # list or tuple work # dfA[name] = dataA[name].astype(object) # ",".join([str(val) for val in dataA[name]]) else: - dfA[name] = dataA[name] + df[name] = data[name] else: # easy case, all 1-D data so no fussing needed - dfA = pd.DataFrame(dataA) + df = pd.DataFrame(data) # clean times - dfA["Time"] = pd.to_datetime( - dfA["Time"].str.decode("utf-8") + df["Time"] = pd.to_datetime( + df["Time"].str.decode("utf-8") ) # hapitime2datetime(np.array(dfA['Time']),**ops) if round_to_sec: - dfA["Time"] = dfA["Time"].dt.round("S") - dfA["Time"] = dfA["Time"].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ") + df["Time"] = df["Time"].dt.round("S") + df["Time"] = df["Time"].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ") - return dfA + return df def merge_hapi( - dataA, metaA, dataC, metaC, round_to_sec=False, fill_nan=False, join_all=True + dataA, metaA, dataB, metaB, round_to_sec: bool = False, fill_nan: bool = False, join_all: bool = True ): - metaAC = copy.deepcopy(metaA) - for ele in metaC["parameters"]: + """Merge two hapi data arrays to a single array. Returns merged hapi data and meta objects. 
+ + Args: + data1 (_type_): Left hapi array to merge + meta1 (_type_): Meta corresponding with data1 + data2 (_type_): Right hapi array to merge + meta2 (_type_): Meta corresponding with data2 + round_to_sec (bool, optional): Rounds time to nearest second. Defaults to False. + fill_nan (bool, optional): Fill NaNs according to fill_value from meta. Defaults to False. + join_all (bool, optional): _description_ + """ + metaAB = copy.deepcopy(metaA) + for ele in metaB["parameters"]: if ele["name"] != "Time" and (join_all or (ele not in metaA["parameters"])): # adjust both dataframe name and hapi metadata if ele in metaA["parameters"]: name = ele["name"] new_name = ( - f"{name}_{metaC['x_dataset']}" # does x_dataset always exist? + f"{name}_{metaB['x_dataset']}" # does x_dataset always exist? ) - dataC = nrecfun.rename_fields(dataC, {name: new_name}) + dataB = nrecfun.rename_fields(dataB, {name: new_name}) ele["name"] = new_name - metaAC["parameters"].append(ele) + metaAB["parameters"].append(ele) dfA = hapi_to_df(dataA, round_to_sec=round_to_sec) - dfC = hapi_to_df(dataC, round_to_sec=round_to_sec) - dt = merge_dtypes(dataA, dataC, trim="Time") - dfAC = pd.merge_ordered(dfA, dfC) # Works! + dfB = hapi_to_df(dataB, round_to_sec=round_to_sec) + dt = merge_dtypes(dataA, dataB, trim="Time") + dfAB = pd.merge_ordered(dfA, dfB) # Works! 
# walk through dfAC and fill all numeric 'NaN' with 'fill' from meta if fill_nan: - for ele in metaAC["parameters"]: + for ele in metaAB["parameters"]: name = ele["name"] try: if ele["fill"] != None: fill = float(ele["fill"]) print("Filling with: ", fill, ele) - dfAC[name] = dfAC[name].fillna(fill) + dfAB[name] = dfAB[name].fillna(fill) except: print("NO fill for ", ele["fill"], ele) pass - newAC = dfAC.to_records(index=False, column_dtypes={"Time": "S30"}) - newAC = np.array( - [tuple([nparray_unpack_to_list(e) for e in elm]) for elm in newAC], dtype=dt + dataAB = dfAB.to_records(index=False, column_dtypes={"Time": "S30"}) + dataAB = np.array( + [tuple([nparray_unpack_to_list(e) for e in elm]) for elm in dataAB], dtype=dt ) - newAC = np.array([tuple(i) for i in newAC], dtype=dt) + dataAB = np.array([tuple(i) for i in dataAB], dtype=dt) - return newAC, metaAC + return dataAB, metaAB From 7526b64755709cb9b80a469658e9d5e9bb2da7c3 Mon Sep 17 00:00:00 2001 From: tinsmcl1 Date: Thu, 3 Oct 2024 12:24:47 -0400 Subject: [PATCH 2/8] round_to_sec fn --- src/hapiutils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/hapiutils.py b/src/hapiutils.py index 49b7f12..6ec97cd 100644 --- a/src/hapiutils.py +++ b/src/hapiutils.py @@ -136,3 +136,11 @@ def merge_hapi( dataAB = np.array([tuple(i) for i in dataAB], dtype=dt) return dataAB, metaAB + + +def df_round_to_sec(df) -> pd.DataFrame: + """Rounds 'Time' column in df to nearest second and returns new DataFrame.""" + df["Time"] = pd.to_datetime(df["Time"].str.decode("utf-8")) + df["Time"] = df["Time"].dt.round("s") + df["Time"] = df["Time"].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ") + return df From a1a810f3b047b967bf8a743911df73d7e515bbc2 Mon Sep 17 00:00:00 2001 From: tinsmcl1 Date: Thu, 3 Oct 2024 12:37:26 -0400 Subject: [PATCH 3/8] replace df_round_to_sec() with clean_time() --- src/hapiutils.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/hapiutils.py b/src/hapiutils.py 
index 6ec97cd..deaa057 100644 --- a/src/hapiutils.py +++ b/src/hapiutils.py @@ -138,9 +138,20 @@ def merge_hapi( return dataAB, metaAB -def df_round_to_sec(df) -> pd.DataFrame: - """Rounds 'Time' column in df to nearest second and returns new DataFrame.""" +def clean_time(df: pd.DataFrame, round_to_sec: bool = False) -> pd.DataFrame: + """Converts time to hapi specified format and optionally rounds time to nearest second. + + Args: + df (pd.DataFrame): DataFrame containing "Time" column to clean. + round_to_sec (bool, optional): Rounds time to nearest second. Defaults to False. + + Returns: + pd.DataFrame: DataFrame with "Time" column cleaned. + """ df["Time"] = pd.to_datetime(df["Time"].str.decode("utf-8")) - df["Time"] = df["Time"].dt.round("s") + if round_to_sec: + df["Time"] = df["Time"].dt.round("s") df["Time"] = df["Time"].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ") return df + + From b9b2db0923136f85588b1ea76281f567cd51d1cc Mon Sep 17 00:00:00 2001 From: tinsmcl1 Date: Thu, 3 Oct 2024 12:46:30 -0400 Subject: [PATCH 4/8] refined hapi_to_df() --- src/hapiutils.py | 58 +++++++++++++----------------------------------- 1 file changed, 15 insertions(+), 43 deletions(-) diff --git a/src/hapiutils.py b/src/hapiutils.py index deaa057..c53897e 100644 --- a/src/hapiutils.py +++ b/src/hapiutils.py @@ -29,58 +29,30 @@ def merge_dtypes(dataA, dataB, trim: str = "Time"): return a -def hapi_to_df( - data, round_to_sec: bool = False, clean_time: bool = False -) -> pd.DataFrame: - """Convert hapi data array to a pandas DataFrame while preserving data types. +def hapi_to_df(data, round_to_sec: bool = False) -> pd.DataFrame: + """Convert hapi data array to a pandas DataFrame, preserving data types. Args: data (_type_): Hapi data array. round_to_sec (bool, optional): Round time to nearest second. Defaults to False. - clean_time (bool, optional): _description_. Defaults to False. Returns: pd.DataFrame: Hapi data in a pandas DataFrame. 
""" - # automatically 'cleans' hapitimes as well - - # if round_to_sec: - # dataA['Time'] = round_hapitime(dataA['Time']) - - has_multiD = False - multiD = {} - namelist = list(data.dtype.names) + data_dict = {} for name in data.dtype.names: - try: - if data[name].shape[1]: - has_multiD = True - multiD[name] = True - except: - multiD[name] = False - - if has_multiD: - df = pd.DataFrame({"Time": data["Time"]}) # ,dtype='string') - namelist.remove("Time") - for name in namelist: - if multiD[name]: - # dfA[name] = pd.Series(dtype='object') - df[name] = list(data[name]) # list or tuple work - # dfA[name] = dataA[name].astype(object) - # ",".join([str(val) for val in dataA[name]]) - else: - df[name] = data[name] - else: - # easy case, all 1-D data so no fussing needed - df = pd.DataFrame(data) - - # clean times - df["Time"] = pd.to_datetime( - df["Time"].str.decode("utf-8") - ) # hapitime2datetime(np.array(dfA['Time']),**ops) - if round_to_sec: - df["Time"] = df["Time"].dt.round("S") - df["Time"] = df["Time"].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ") - + column_data = data[name] + + # Check if the field's dtype includes a subarray shape (multi-dimensional) + if len(column_data.shape) > 1: + # Convert subarray fields into lists or tuples + data_dict[name] = [element for element in column_data] + else: + # Directly assign the data for 1d dtypes + data_dict[name] = column_data + + df = pd.DataFrame(data_dict) + df = clean_time(df, round_to_sec=round_to_sec) return df From 6f55f7cdb11bd92f1469cdba4db5f735e808e531 Mon Sep 17 00:00:00 2001 From: tinsmcl1 Date: Thu, 3 Oct 2024 13:06:05 -0400 Subject: [PATCH 5/8] add helper fns & diff merge methods to merge_hapi() --- src/hapiutils.py | 103 ++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/src/hapiutils.py b/src/hapiutils.py index c53897e..36fb18a 100644 --- a/src/hapiutils.py +++ b/src/hapiutils.py @@ -4,6 +4,8 @@ import pandas as pd import copy from datetime import 
datetime, timedelta +from typing import Literal +from hapiclient.hapi import compute_dt def nparray_unpack_to_list(arr) -> list: @@ -57,55 +59,41 @@ def hapi_to_df(data, round_to_sec: bool = False) -> pd.DataFrame: def merge_hapi( - dataA, metaA, dataB, metaB, round_to_sec: bool = False, fill_nan: bool = False, join_all: bool = True -): - """Merge two hapi data arrays to a single array. Returns merged hapi data and meta objects. + dataA, metaA, dataB, metaB, how: Literal["left", "right", "outer", "inner"] = "outer", round_to_sec: bool = False, fill_nan: bool = False): + """Merge two hapi data arrays to single array via specified merge type. Returns merged hapi data and meta objects. Args: - data1 (_type_): Left hapi array to merge - meta1 (_type_): Meta corresponding with data1 - data2 (_type_): Right hapi array to merge - meta2 (_type_): Meta corresponding with data2 + dataA: Left hapi array to merge + metaA: Meta corresponding with dataA + dataB: Right hapi array to merge + metaB: Meta corresponding with dataB + how (str, optional): Type of merge: 'left', 'right', 'outer', or 'inner'. See documentation for pandas.merge_ordered for descriptions. Defaults to 'outer'. round_to_sec (bool, optional): Rounds time to nearest second. Defaults to False. fill_nan (bool, optional): Fill NaNs according to fill_value from meta. Defaults to False. - join_all (bool, optional): _description_ """ metaAB = copy.deepcopy(metaA) - for ele in metaB["parameters"]: - if ele["name"] != "Time" and (join_all or (ele not in metaA["parameters"])): - # adjust both dataframe name and hapi metadata - if ele in metaA["parameters"]: - name = ele["name"] - new_name = ( - f"{name}_{metaB['x_dataset']}" # does x_dataset always exist? 
- ) - dataB = nrecfun.rename_fields(dataB, {name: new_name}) - ele["name"] = new_name - metaAB["parameters"].append(ele) - + new_names = {} + for param in metaB["parameters"]: + if param["name"] != "Time": + # If the field is already in the left array, change the field name + if param in metaA["parameters"]: + new_name = f"{param['name']}_{metaB['x_dataset']}" # TODO: does x_dataset always exist? + new_names[param["name"]] = new_name + param["name"] = new_name + # Update meta + metaAB["parameters"].append(param) + + dataB = nrecfun.rename_fields(dataB, new_names) + + # Convert structured arrays to DataFrames and merge on "Time" fields dfA = hapi_to_df(dataA, round_to_sec=round_to_sec) dfB = hapi_to_df(dataB, round_to_sec=round_to_sec) - dt = merge_dtypes(dataA, dataB, trim="Time") - dfAB = pd.merge_ordered(dfA, dfB) # Works! + dfAB = pd.merge_ordered(dfA, dfB, on="Time", how=how) - # walk through dfAC and fill all numeric 'NaN' with 'fill' from meta if fill_nan: - for ele in metaAB["parameters"]: - name = ele["name"] - try: - if ele["fill"] != None: - fill = float(ele["fill"]) - print("Filling with: ", fill, ele) - dfAB[name] = dfAB[name].fillna(fill) - except: - print("NO fill for ", ele["fill"], ele) - pass + dfAB = df_fill_nans(dfAB, metaAB) - dataAB = dfAB.to_records(index=False, column_dtypes={"Time": "S30"}) - dataAB = np.array( - [tuple([nparray_unpack_to_list(e) for e in elm]) for elm in dataAB], dtype=dt - ) - dataAB = np.array([tuple(i) for i in dataAB], dtype=dt) + dataAB = df_to_hapi(dfAB, metaAB) return dataAB, metaAB @@ -127,3 +115,42 @@ def clean_time(df: pd.DataFrame, round_to_sec: bool = False) -> pd.DataFrame: return df +def df_fill_nans(df, meta) -> pd.DataFrame: + """Returns new hapi DataFrame with all NaN values filled according to fill_value in meta.""" + for param in meta["parameters"]: + name = param["name"] + if param["fill"] is None: + print(f"No fill: {param['name']} --> {param['fill']}") + else: + try: + fill = float(param["fill"]) + 
df[name] = df[name].fillna(fill) + print(f"Fill successful: {param['name']} --> {param['fill']}") + except: + print(f"Fill failed: {param['name']} -> {param['fill']}") + pass + return df + + +def dtypes_from_data(data) -> list: + """Returns parameter data types from hapi data array.""" + dt = [(param, data.dtype.fields[param][0]) for param in data.dtype.names] + return dt + + +def dtypes_from_meta(meta) -> list: + """Returns parameter data types from meta.""" + dt, _, _, _, _, _ = compute_dt(meta, {"format": ""}) + return dt + + +def df_to_hapi(df, meta): + """Converts a hapi DataFrame to a hapi array.""" + dt = dtypes_from_meta(meta) + data = df.to_records(index=False, column_dtypes={"Time": "S30"}) + data = np.array( + [tuple([nparray_unpack_to_list(e) for e in elm]) for elm in data], dtype=dt + ) + data = np.array([tuple(i) for i in data], dtype=dt) + return data + From 56b33b1b1614c290616a05f840fec26d0c9ee184 Mon Sep 17 00:00:00 2001 From: tinsmcl1 Date: Thu, 3 Oct 2024 13:28:17 -0400 Subject: [PATCH 6/8] add resample functionality --- src/hapiutils.py | 86 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/src/hapiutils.py b/src/hapiutils.py index 36fb18a..9712bf2 100644 --- a/src/hapiutils.py +++ b/src/hapiutils.py @@ -1,11 +1,11 @@ -from hapiclient.hapitime import hapitime2datetime, datetime2hapitime -import numpy.lib.recfunctions as nrecfun -import numpy as np -import pandas as pd import copy -from datetime import datetime, timedelta from typing import Literal + +import numpy as np +import numpy.lib.recfunctions as nrecfun +import pandas as pd from hapiclient.hapi import compute_dt +from hapiclient.hapitime import hapitime2datetime, datetime2hapitime def nparray_unpack_to_list(arr) -> list: @@ -59,17 +59,27 @@ def hapi_to_df(data, round_to_sec: bool = False) -> pd.DataFrame: def merge_hapi( - dataA, metaA, dataB, metaB, how: Literal["left", "right", "outer", "inner"] = "outer", round_to_sec: bool 
= False, fill_nan: bool = False): - """Merge two hapi data arrays to single array via specified merge type. Returns merged hapi data and meta objects. + dataA, + metaA, + dataB, + metaB, + how: Literal["left", "right", "outer", "inner"] = "outer", + round_to_sec: bool = False, + fill_nan: bool = False, +): + """Merge two hapi data arrays to single array via specified merge type. Returns + merged hapi data and meta objects. Args: dataA: Left hapi array to merge metaA: Meta corresponding with dataA dataB: Right hapi array to merge metaB: Meta corresponding with dataB - how (str, optional): Type of merge: 'left', 'right', 'outer', or 'inner'. See documentation for pandas.merge_ordered for descriptions. Defaults to 'outer'. + how (str, optional): Type of merge: 'left', 'right', 'outer', or 'inner'. See + documentation for pandas.merge_ordered for descriptions. Defaults to 'outer'. round_to_sec (bool, optional): Rounds time to nearest second. Defaults to False. - fill_nan (bool, optional): Fill NaNs according to fill_value from meta. Defaults to False. + fill_nan (bool, optional): Fill NaNs according to fill_value from meta. Defaults + to False. """ metaAB = copy.deepcopy(metaA) new_names = {} @@ -77,7 +87,7 @@ def merge_hapi( if param["name"] != "Time": # If the field is already in the left array, change the field name if param in metaA["parameters"]: - new_name = f"{param['name']}_{metaB['x_dataset']}" # TODO: does x_dataset always exist? + new_name = f"{param['name']}_{metaB['x_dataset']}" # Does x_dataset always exist? new_names[param["name"]] = new_name param["name"] = new_name # Update meta @@ -154,3 +164,59 @@ def df_to_hapi(df, meta): data = np.array([tuple(i) for i in data], dtype=dt) return data + +def resample_hapi( + data, + meta, + interval: str, + round_to_sec: bool = False, + tolerance: float | None = None, + start_time: str | None = None, + end_time: str | None = None, + limit: int | None = None, +): + """ + Resample hapi data at specified intervals. 
If an exact time does not exist, + substitutes with nearest time. + """ + df = hapi_to_df(data, round_to_sec=round_to_sec) + + # Format dataframe for pandas resampling + # df["Time"] = pd.to_datetime(df["Time"]) + df["Time"] = hapitime2datetime(df["Time"].values) + df = df.set_index("Time") + + if start_time is not None: + start_time = pd.to_datetime(start_time) + df = df[df.index >= start_time] + + if end_time is not None: + end_time = pd.to_datetime(end_time) + df = df[df.index <= end_time] + + if tolerance: + target_times = pd.date_range( + start=df.index.min(), end=df.index.max(), freq=interval + ) + tolerance_timedelta = pd.to_timedelta(tolerance) + + # Iterate over each target time and find the closest time within the tolerance + sampled_rows = [] + for target_time in target_times: + time_diffs = abs((df.index - target_time).total_seconds()) + closest_idx = time_diffs.idxmin() + if time_diffs[closest_idx] <= tolerance_timedelta: + sampled_rows.append(df.loc[closest_idx]) + + resampled_df = pd.DataFrame(sampled_rows).reset_index() + + else: + # Resample the DataFrame using the nearest value + resampled_df = df.resample(interval, origin="start").nearest(limit=limit) + resampled_df = resampled_df.reset_index() + + resampled_meta = copy.deepcopy(meta) # Does meta need to be updated? 
+ resampled_df["Time"] = resampled_df["Time"].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ") + # resampled_df["Time"] = datetime2hapitime(resampled_df["Time"].values) + resampled_data = df_to_hapi(resampled_df, resampled_meta) + return resampled_data, resampled_meta # , resampled_df From dd4e06786d1b1b42a8f8f1340bec29fd8d2be63f Mon Sep 17 00:00:00 2001 From: tinsmcl1 Date: Thu, 3 Oct 2024 14:44:51 -0400 Subject: [PATCH 7/8] docstring --- src/hapiutils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hapiutils.py b/src/hapiutils.py index 9712bf2..0214e42 100644 --- a/src/hapiutils.py +++ b/src/hapiutils.py @@ -176,8 +176,8 @@ def resample_hapi( limit: int | None = None, ): """ - Resample hapi data at specified intervals. If an exact time does not exist, - substitutes with nearest time. + Resample hapi data at specified intervals. If data for a time does not exist, + uses data value from the nearest time. """ df = hapi_to_df(data, round_to_sec=round_to_sec) From e8686b53e9bf1d384eae5234375c7a441e548af6 Mon Sep 17 00:00:00 2001 From: antunak1 Date: Thu, 3 Oct 2024 15:11:56 -0400 Subject: [PATCH 8/8] git testing --- tests/test2.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/test2.py diff --git a/tests/test2.py b/tests/test2.py new file mode 100644 index 0000000..4b6ca5b --- /dev/null +++ b/tests/test2.py @@ -0,0 +1 @@ +# stub to test git