From 2a27d4000cdaeeb4df2ecba4112102fe08f32191 Mon Sep 17 00:00:00 2001 From: Alan Gauthier Date: Fri, 24 Apr 2026 10:16:37 +0200 Subject: [PATCH 1/3] fix: handle string arrays in materialization Signed-off-by: Alan Gauthier --- sdk/python/feast/type_map.py | 52 ++++++++- sdk/python/tests/unit/test_type_map.py | 151 +++++++++++++++++++++++++ 2 files changed, 201 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index e9ccee08f25..bf67c1b8e6f 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -739,7 +739,7 @@ def _validate_collection_item_types( """ if sample is None: return - if all(type(item) in valid_types for item in sample): + if all(type(item) in valid_types for item in sample if item is not None): return # to_numpy() upcasts INT32/INT64 with NULL to Float64 automatically @@ -750,6 +750,8 @@ def _validate_collection_item_types( ValueType.INT64_SET, ] for item in sample: + if item is None: + continue # None elements in STRING_LIST are replaced with ""; for other types they are dropped if type(item) not in valid_types: if feast_value_type in int_collection_types: # Check if the float values are due to NULL upcast @@ -868,6 +870,46 @@ def convert_set_to_list(value: Any) -> Any: ] +# Sentinel value used by _to_proto_safe_list to indicate that None elements +# should simply be filtered (dropped) rather than replaced with a default. +_DROP_NONE = object() + +# Per-type default values substituted for None elements inside list columns. +# Only STRING_LIST uses ""; numeric/bytes types drop None entirely because +# there is no meaningful in-band sentinel (protobuf rejects wrong scalar types). +_LIST_TYPE_NONE_REPLACEMENT: Dict[ValueType, Any] = { + ValueType.STRING_LIST: "", +} + + +def _to_proto_safe_list( + value: Any, feast_value_type: ValueType = ValueType.STRING_LIST +) -> Any: + """Convert an array/list column value to a proto-safe Python list. + + Arrow/Athena returns Array columns as numpy.ndarray (object dtype). + Protobuf repeated fields reject ndarrays and (for non-string types) None + elements, so we: + 1. Call .tolist() to convert any numpy.ndarray to a plain Python list. + 2. For STRING_LIST: replace None elements with "" (empty string). + For all other list types: drop None elements, since there is no valid + in-band default for numeric/bytes protobuf fields. + + Args: + value: The raw column value (ndarray, list, or scalar). + feast_value_type: The Feast ValueType of the list column. Controls how + None elements are handled. Defaults to STRING_LIST. + """ + if isinstance(value, np.ndarray): + value = value.tolist() + if isinstance(value, list): + none_replacement = _LIST_TYPE_NONE_REPLACEMENT.get(feast_value_type, _DROP_NONE) + if none_replacement is _DROP_NONE: + return [x for x in value if x is not None] + return [x if x is not None else none_replacement for x in value] + return value + + def _convert_list_values_to_proto( feast_value_type: ValueType, values: List[Any], @@ -946,8 +988,14 @@ def _convert_list_values_to_proto( ] # Generic list conversion + # Arrow/Athena deserializes Array columns as numpy.ndarray (object dtype). + # _to_proto_safe_list converts to a plain Python list and sanitizes None + # elements in a type-appropriate way (replaced with "" for STRING_LIST, + # dropped for numeric/bytes types). return [ - ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore[arg-type] + ProtoValue( + **{field_name: proto_type(val=_to_proto_safe_list(value, feast_value_type))} # type: ignore[arg-type] + ) if value is not None else ProtoValue() for value in values diff --git a/sdk/python/tests/unit/test_type_map.py b/sdk/python/tests/unit/test_type_map.py index bdaea63a607..69b9aa90ee1 100644 --- a/sdk/python/tests/unit/test_type_map.py +++ b/sdk/python/tests/unit/test_type_map.py @@ -2065,3 +2065,154 @@ def test_proto_field_name_in_map(self): from feast.type_map import PROTO_VALUE_TO_VALUE_TYPE_MAP assert PROTO_VALUE_TO_VALUE_TYPE_MAP["scalar_map_val"] == ValueType.SCALAR_MAP + + +class TestArrowArrayStringListMaterialization: + """Regression tests for Array(String) columns from Arrow/Athena materialization. + + Arrow/Athena deserializes Array(String) feature columns as numpy.ndarray with + object dtype. Two bugs were triggered: + + 1. ValueError: "The truth value of an empty array is ambiguous" + — when an empty ndarray reached the scalar null-check `elif not pd.isnull(value)`. + + 2. TypeError: "bad argument type for built-in operation" + — when proto_type(val=) was called; protobuf rejects ndarrays. + + Both are fixed by _to_proto_safe_list, which converts ndarrays to plain Python + lists and sanitizes None elements in a type-appropriate way: + - STRING_LIST: None → "" (empty string) + - All other list types: None elements are dropped (filtered out) + """ + + def test_to_proto_safe_list_ndarray(self): + """ndarray is converted to a plain Python list.""" + from feast.type_map import _to_proto_safe_list + + arr = np.array(["foo", "bar"], dtype=object) + result = _to_proto_safe_list(arr) + assert result == ["foo", "bar"] + assert isinstance(result, list) + + def test_to_proto_safe_list_empty_ndarray(self): + """Empty ndarray is converted to an empty list.""" + from feast.type_map import _to_proto_safe_list + + arr = np.array([], dtype=object) + result = _to_proto_safe_list(arr) + assert result == [] + assert isinstance(result, list) + + def test_to_proto_safe_list_ndarray_with_none(self): + """None elements inside a STRING_LIST ndarray are replaced with empty string.""" + from feast.type_map import _to_proto_safe_list + + arr = np.array(["foo", None, "baz"], dtype=object) + result = _to_proto_safe_list(arr, ValueType.STRING_LIST) + assert result == ["foo", "", "baz"] + + def test_to_proto_safe_list_plain_list(self): + """Plain Python lists pass through unchanged (no None replacement needed).""" + from feast.type_map import _to_proto_safe_list + + lst = ["foo", "bar"] + result = _to_proto_safe_list(lst) + assert result == ["foo", "bar"] + + def test_to_proto_safe_list_plain_list_with_none(self): + """None elements in a STRING_LIST plain list are replaced with empty string.""" + from feast.type_map import _to_proto_safe_list + + lst = ["foo", None] + result = _to_proto_safe_list(lst, ValueType.STRING_LIST) + assert result == ["foo", ""] + + def test_to_proto_safe_list_numeric_list_none_dropped(self): + """None elements in non-string lists are dropped, not replaced with a sentinel.""" + from feast.type_map import _to_proto_safe_list + + for vt in ( + ValueType.FLOAT_LIST, + ValueType.DOUBLE_LIST, + ValueType.INT32_LIST, + ValueType.INT64_LIST, + ValueType.BYTES_LIST, + ): + result = _to_proto_safe_list([1.0, None, 2.0], vt) + assert result == [1.0, 2.0], ( + f"Expected None dropped for {vt}, got {result!r}" + ) + + def test_to_proto_safe_list_scalar_passthrough(self): + """Non-list, non-ndarray values are returned unchanged.""" + from feast.type_map import _to_proto_safe_list + + assert _to_proto_safe_list("hello") == "hello" + assert _to_proto_safe_list(None) is None + assert _to_proto_safe_list(42) == 42 + + def test_string_list_from_ndarray(self): + """STRING_LIST column with ndarray values materializes without TypeError.""" + values = [ + np.array(["foo", "bar"], dtype=object), + np.array(["baz"], dtype=object), + ] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + assert len(protos) == 2 + assert list(protos[0].string_list_val.val) == ["foo", "bar"] + assert list(protos[1].string_list_val.val) == ["baz"] + + def test_string_list_from_empty_ndarray(self): + """Empty ndarray in a STRING_LIST column must not raise ValueError.""" + values = [ + np.array([], dtype=object), + np.array(["foo"], dtype=object), + ] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + assert list(protos[0].string_list_val.val) == [] + assert list(protos[1].string_list_val.val) == ["foo"] + + def test_string_list_from_ndarray_with_none_elements(self): + """None elements inside an ndarray must not cause TypeError in protobuf.""" + values = [ + np.array(["foo", None, "baz"], dtype=object), + ] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + # None is replaced with empty string + assert list(protos[0].string_list_val.val) == ["foo", "", "baz"] + + def test_string_list_null_row_produces_empty_proto(self): + """A None row (missing user) produces an empty ProtoValue.""" + from feast.protos.feast.types.Value_pb2 import Value as ProtoValue + + values = [ + None, + np.array(["foo"], dtype=object), + ] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + assert protos[0] == ProtoValue() + assert list(protos[1].string_list_val.val) == ["foo"] + + def test_mixed_batch_simulating_athena_chunk(self): + """Simulate a real Athena chunk: mix of ndarray, empty ndarray, and None rows. + + This is the exact scenario that triggered the TypeError during + string_list_features materialization. + """ + from feast.protos.feast.types.Value_pb2 import Value as ProtoValue + + # tags / labels column from Athena + values = [ + np.array(["foo", "bar"], dtype=object), # normal entity + np.array([], dtype=object), # entity with no values set + None, # missing entity (NULL row) + np.array(["baz"], dtype=object), # normal entity + np.array(["qux", None], dtype=object), # entity with partial null + ] + protos = python_values_to_proto_values(values, ValueType.STRING_LIST) + + assert list(protos[0].string_list_val.val) == ["foo", "bar"] + assert list(protos[1].string_list_val.val) == [] + assert protos[2] == ProtoValue() + assert list(protos[3].string_list_val.val) == ["baz"] + assert list(protos[4].string_list_val.val) == ["qux", ""] From 6498775b604b4fcbee69f1d7f5efa6f5472a4977 Mon Sep 17 00:00:00 2001 From: Alan Gauthier Date: Thu, 7 May 2026 11:12:26 +0200 Subject: [PATCH 2/3] pr feedback: none default values per type Signed-off-by: Alan Gauthier --- sdk/python/feast/type_map.py | 66 +++++++++--------- sdk/python/tests/unit/test_type_map.py | 94 ++++++++++++++------------ 2 files changed, 83 insertions(+), 77 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index bf67c1b8e6f..56347ec618f 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -870,43 +870,38 @@ def convert_set_to_list(value: Any) -> Any: ] -# Sentinel value used by _to_proto_safe_list to indicate that None elements -# should simply be filtered (dropped) rather than replaced with a default. -_DROP_NONE = object() - # Per-type default values substituted for None elements inside list columns. -# Only STRING_LIST uses ""; numeric/bytes types drop None entirely because -# there is no meaningful in-band sentinel (protobuf rejects wrong scalar types). -_LIST_TYPE_NONE_REPLACEMENT: Dict[ValueType, Any] = { +# Protobuf repeated fields do not accept None, so we replace with a +# type-appropriate zero/empty value. +_LIST_NONE_DEFAULTS: Dict[ValueType, Any] = { ValueType.STRING_LIST: "", + ValueType.BYTES_LIST: b"", + ValueType.INT32_LIST: 0, + ValueType.INT64_LIST: 0, + ValueType.FLOAT_LIST: 0.0, + ValueType.DOUBLE_LIST: 0.0, + ValueType.BOOL_LIST: False, + ValueType.UNIX_TIMESTAMP_LIST: NULL_TIMESTAMP_INT_VALUE, + ValueType.UUID_LIST: "", + ValueType.TIME_UUID_LIST: "", + ValueType.DECIMAL_LIST: "", } -def _to_proto_safe_list( - value: Any, feast_value_type: ValueType = ValueType.STRING_LIST -) -> Any: - """Convert an array/list column value to a proto-safe Python list. - - Arrow/Athena returns Array columns as numpy.ndarray (object dtype). - Protobuf repeated fields reject ndarrays and (for non-string types) None - elements, so we: - 1. Call .tolist() to convert any numpy.ndarray to a plain Python list. - 2. For STRING_LIST: replace None elements with "" (empty string). - For all other list types: drop None elements, since there is no valid - in-band default for numeric/bytes protobuf fields. +def _sanitize_list_value(value: Any, feast_value_type: ValueType) -> Any: + """Convert ndarray to list and replace None elements with a type-appropriate default. - Args: - value: The raw column value (ndarray, list, or scalar). - feast_value_type: The Feast ValueType of the list column. Controls how - None elements are handled. Defaults to STRING_LIST. + Arrow/Athena may deserialize array columns as numpy.ndarray with object dtype + instead of plain Python lists. Protobuf repeated fields do not accept ndarrays + or None elements, so we normalise here before building proto messages. """ if isinstance(value, np.ndarray): value = value.tolist() - if isinstance(value, list): - none_replacement = _LIST_TYPE_NONE_REPLACEMENT.get(feast_value_type, _DROP_NONE) - if none_replacement is _DROP_NONE: - return [x for x in value if x is not None] - return [x if x is not None else none_replacement for x in value] + if isinstance(value, list) and len(value) == 0: + return None + none_default = _LIST_NONE_DEFAULTS.get(feast_value_type) + if none_default is not None and isinstance(value, list): + value = [none_default if v is None else v for v in value] return value @@ -932,6 +927,13 @@ def _convert_list_values_to_proto( feast_value_type ] + values = [ + _sanitize_list_value(v, feast_value_type) if v is not None else v + for v in values + ] + if sample is not None: + sample = _sanitize_list_value(sample, feast_value_type) + # Bytes to array type conversion if isinstance(sample, (bytes, bytearray)): if feast_value_type == ValueType.BYTES_LIST: @@ -988,14 +990,8 @@ def _convert_list_values_to_proto( ] # Generic list conversion - # Arrow/Athena deserializes Array columns as numpy.ndarray (object dtype). - # _to_proto_safe_list converts to a plain Python list and sanitizes None - # elements in a type-appropriate way (replaced with "" for STRING_LIST, - # dropped for numeric/bytes types). return [ - ProtoValue( - **{field_name: proto_type(val=_to_proto_safe_list(value, feast_value_type))} # type: ignore[arg-type] - ) + ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore[arg-type] if value is not None else ProtoValue() for value in values diff --git a/sdk/python/tests/unit/test_type_map.py b/sdk/python/tests/unit/test_type_map.py index 69b9aa90ee1..6025cdaf590 100644 --- a/sdk/python/tests/unit/test_type_map.py +++ b/sdk/python/tests/unit/test_type_map.py @@ -2079,77 +2079,87 @@ class TestArrowArrayStringListMaterialization: 2. TypeError: "bad argument type for built-in operation" — when proto_type(val=) was called; protobuf rejects ndarrays. - Both are fixed by _to_proto_safe_list, which converts ndarrays to plain Python - lists and sanitizes None elements in a type-appropriate way: - - STRING_LIST: None → "" (empty string) - - All other list types: None elements are dropped (filtered out) + Both are fixed by _sanitize_list_value, which converts ndarrays to plain Python + lists and replaces None elements with a type-appropriate zero/empty default + (see _LIST_NONE_DEFAULTS). """ - def test_to_proto_safe_list_ndarray(self): + def test_sanitize_list_value_ndarray(self): """ndarray is converted to a plain Python list.""" - from feast.type_map import _to_proto_safe_list + from feast.type_map import _sanitize_list_value arr = np.array(["foo", "bar"], dtype=object) - result = _to_proto_safe_list(arr) + result = _sanitize_list_value(arr, ValueType.STRING_LIST) assert result == ["foo", "bar"] assert isinstance(result, list) - def test_to_proto_safe_list_empty_ndarray(self): - """Empty ndarray is converted to an empty list.""" - from feast.type_map import _to_proto_safe_list + def test_sanitize_list_value_empty_ndarray(self): + """Empty ndarray is converted to None (treated as a missing row).""" + from feast.type_map import _sanitize_list_value arr = np.array([], dtype=object) - result = _to_proto_safe_list(arr) - assert result == [] - assert isinstance(result, list) + result = _sanitize_list_value(arr, ValueType.STRING_LIST) + assert result is None - def test_to_proto_safe_list_ndarray_with_none(self): + def test_sanitize_list_value_ndarray_with_none(self): """None elements inside a STRING_LIST ndarray are replaced with empty string.""" - from feast.type_map import _to_proto_safe_list + from feast.type_map import _sanitize_list_value arr = np.array(["foo", None, "baz"], dtype=object) - result = _to_proto_safe_list(arr, ValueType.STRING_LIST) + result = _sanitize_list_value(arr, ValueType.STRING_LIST) assert result == ["foo", "", "baz"] - def test_to_proto_safe_list_plain_list(self): - """Plain Python lists pass through unchanged (no None replacement needed).""" - from feast.type_map import _to_proto_safe_list + def test_sanitize_list_value_plain_list(self): + """Plain Python lists without None pass through unchanged.""" + from feast.type_map import _sanitize_list_value lst = ["foo", "bar"] - result = _to_proto_safe_list(lst) + result = _sanitize_list_value(lst, ValueType.STRING_LIST) assert result == ["foo", "bar"] - def test_to_proto_safe_list_plain_list_with_none(self): + def test_sanitize_list_value_plain_list_with_none(self): """None elements in a STRING_LIST plain list are replaced with empty string.""" - from feast.type_map import _to_proto_safe_list + from feast.type_map import _sanitize_list_value lst = ["foo", None] - result = _to_proto_safe_list(lst, ValueType.STRING_LIST) + result = _sanitize_list_value(lst, ValueType.STRING_LIST) assert result == ["foo", ""] - def test_to_proto_safe_list_numeric_list_none_dropped(self): - """None elements in non-string lists are dropped, not replaced with a sentinel.""" - from feast.type_map import _to_proto_safe_list + def test_sanitize_list_value_numeric_none_replaced(self): + """None elements in numeric lists are replaced with a type-appropriate default.""" + from feast.type_map import _sanitize_list_value - for vt in ( - ValueType.FLOAT_LIST, - ValueType.DOUBLE_LIST, - ValueType.INT32_LIST, - ValueType.INT64_LIST, - ValueType.BYTES_LIST, - ): - result = _to_proto_safe_list([1.0, None, 2.0], vt) - assert result == [1.0, 2.0], ( - f"Expected None dropped for {vt}, got {result!r}" - ) + assert _sanitize_list_value([1, None, 2], ValueType.INT32_LIST) == [1, 0, 2] + assert _sanitize_list_value([1, None, 2], ValueType.INT64_LIST) == [1, 0, 2] + assert _sanitize_list_value([1.0, None, 2.0], ValueType.FLOAT_LIST) == [ + 1.0, + 0.0, + 2.0, + ] + assert _sanitize_list_value([1.0, None, 2.0], ValueType.DOUBLE_LIST) == [ + 1.0, + 0.0, + 2.0, + ] + assert _sanitize_list_value([True, None, False], ValueType.BOOL_LIST) == [ + True, + False, + False, + ] + + def test_sanitize_list_value_bytes_none_replaced(self): + """None elements in BYTES_LIST are replaced with b''.""" + from feast.type_map import _sanitize_list_value + + result = _sanitize_list_value([b"x", None], ValueType.BYTES_LIST) + assert result == [b"x", b""] - def test_to_proto_safe_list_scalar_passthrough(self): + def test_sanitize_list_value_scalar_passthrough(self): """Non-list, non-ndarray values are returned unchanged.""" - from feast.type_map import _to_proto_safe_list + from feast.type_map import _sanitize_list_value - assert _to_proto_safe_list("hello") == "hello" - assert _to_proto_safe_list(None) is None - assert _to_proto_safe_list(42) == 42 + assert _sanitize_list_value("hello", ValueType.STRING_LIST) == "hello" + assert _sanitize_list_value(42, ValueType.INT32_LIST) == 42 def test_string_list_from_ndarray(self): """STRING_LIST column with ndarray values materializes without TypeError.""" From 066c9a229c4ee32ce5faffb943ba27cf49e1c00a Mon Sep 17 00:00:00 2001 From: Alan Gauthier Date: Tue, 26 May 2026 14:29:58 +0200 Subject: [PATCH 3/3] fix integration tests Signed-off-by: Alan Gauthier --- sdk/python/feast/type_map.py | 2 -- .../tests/integration/offline_store/test_offline_write.py | 2 ++ sdk/python/tests/unit/test_type_map.py | 7 +++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 56347ec618f..91bb56f6386 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -897,8 +897,6 @@ def _sanitize_list_value(value: Any, feast_value_type: ValueType) -> Any: """ if isinstance(value, np.ndarray): value = value.tolist() - if isinstance(value, list) and len(value) == 0: - return None none_default = _LIST_NONE_DEFAULTS.get(feast_value_type) if none_default is not None and isinstance(value, list): value = [none_default if v is None else v for v in value] diff --git a/sdk/python/tests/integration/offline_store/test_offline_write.py b/sdk/python/tests/integration/offline_store/test_offline_write.py index df60e40ed56..6ae8b68147a 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_write.py +++ b/sdk/python/tests/integration/offline_store/test_offline_write.py @@ -135,6 +135,8 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour ) store.apply([driver_entity, driver_stats]) + # Refresh registry after apply to ensure subsequent reads see the new feature view + store.refresh_registry() df = store.get_historical_features( entity_df=entity_df, features=[ diff --git a/sdk/python/tests/unit/test_type_map.py b/sdk/python/tests/unit/test_type_map.py index 6025cdaf590..8a865c72fb1 100644 --- a/sdk/python/tests/unit/test_type_map.py +++ b/sdk/python/tests/unit/test_type_map.py @@ -87,6 +87,9 @@ def test_python_values_to_proto_values_bool(values): (np.array([None]), ValueType.BYTES_LIST, None), (np.array([None]), ValueType.STRING_LIST, None), (np.array([None]), ValueType.UNIX_TIMESTAMP_LIST, None), + ([np.array([], dtype=np.int32)], ValueType.INT32_LIST, []), + ([np.array([], dtype=np.float32)], ValueType.FLOAT_LIST, []), + ([np.array([], dtype=np.bool_)], ValueType.BOOL_LIST, []), ([b"[1,2,3]"], ValueType.INT64_LIST, [1, 2, 3]), ([b"[1,2,3]"], ValueType.INT32_LIST, [1, 2, 3]), ([b"[1.5,2.5,3.5]"], ValueType.FLOAT_LIST, [1.5, 2.5, 3.5]), @@ -2094,12 +2097,12 @@ def test_sanitize_list_value_ndarray(self): assert isinstance(result, list) def test_sanitize_list_value_empty_ndarray(self): - """Empty ndarray is converted to None (treated as a missing row).""" + """Empty ndarray is converted to an empty Python list.""" from feast.type_map import _sanitize_list_value arr = np.array([], dtype=object) result = _sanitize_list_value(arr, ValueType.STRING_LIST) - assert result is None + assert result == [] def test_sanitize_list_value_ndarray_with_none(self): """None elements inside a STRING_LIST ndarray are replaced with empty string."""