From 713768e036443ecf404437685a18ee354bd087fd Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 31 Mar 2024 10:45:02 -0700 Subject: [PATCH 01/40] feat: add document store --- sdk/python/feast/feature_store.py | 78 ++++++++++ .../infra/online_stores/contrib/postgres.py | 123 ++++++++++++--- .../infra/online_stores/document_store.py | 38 +++++ .../feast/infra/online_stores/helpers.py | 10 ++ .../feast/infra/passthrough_provider.py | 28 +++- sdk/python/feast/infra/provider.py | 141 +++++++++++------- sdk/python/feast/repo_config.py | 6 + 7 files changed, 341 insertions(+), 83 deletions(-) create mode 100644 sdk/python/feast/infra/online_stores/document_store.py diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index bfb8a59b2bb..b245957b98d 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -35,6 +35,7 @@ import pandas as pd import pyarrow as pa +import numpy as np from colorama import Fore, Style from google.protobuf.timestamp_pb2 import Timestamp from tqdm import tqdm @@ -1684,6 +1685,62 @@ def _get_online_features( ) return OnlineResponse(online_features_response) + @log_exceptions_and_usage + def get_top_k_document_features(self, + feature: Union[str, FeatureService], + document: Union[str, np.ndarray], + top_k: int, + ) -> OnlineResponse: + """ + Retrieves the top k cloeses document features. + + Args: + feature: The list of document features that should be retrieved from the online document store. These features can be + specified either as a list of string document feature references or as a feature service. String feature + references must have format "feature_view:feature", e.g, "document_fv:document_embedding_feature". + document: The document to retrieve the closest document features for. + top_k: The number of closest document features to retrieve. 
+ """ + return self._get_top_k_document_features( + feature=feature, + document=document, + top_k=top_k, + ) + + def _get_top_k_document_features( + self, + feature: Union[str, FeatureService], + document: Union[str, np.ndarray], + top_k: int, + ): + ( + requested_feature_views, + requested_on_demand_feature_views + ) = self._get_feature_views_to_use( + features=[feature], + allow_cache=True, + hide_dummy_entity=False + ) + requested_feature = feature.split(":")[1] if isinstance(feature, str) else feature + provider = self._get_provider() + document_features = self._search_from_document_store( + provider, + requested_feature_views[0], + requested_feature, + document, + top_k, + ) + online_features_response = GetOnlineFeaturesResponse(results=[]) + self._populate_response_from_feature_data( + document_features, + [], + online_features_response, + False, + requested_feature, + requested_feature_views[0] + ) + return OnlineResponse(online_features_response) + @staticmethod def _get_columnar_entity_values( rowise: Optional[List[Dict[str, Any]]], columnar: Optional[Dict[str, List[Any]]] @@ -1900,6 +1957,27 @@ def _read_from_online_store( read_row_protos.append((event_timestamps, statuses, values)) return read_row_protos + def _search_from_document_store( + self, + provider: Provider, + table: FeatureView, + requested_feature: str, + document: Union[str, np.ndarray], + top_k: int, + ) -> List[Tuple[List[Timestamp], List["FieldStatus.ValueType"], List[Value]]]: + """ + Search and return document features from the online document store. 
+ """ + documents = provider.online_search( + config=self.config, + table=table, + requested_feature=requested_feature, + document=document, + top_k=top_k, + ) + return documents + + @staticmethod def _populate_response_from_feature_data( feature_data: Iterable[ diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 308528aaec2..a54edd889ae 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -4,6 +4,7 @@ from datetime import datetime from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple +import numpy as np import psycopg2 import pytz from psycopg2 import sql @@ -12,8 +13,10 @@ from feast import Entity from feast.feature_view import FeatureView +from feast.feature import Feature from feast.infra.key_encoding_utils import serialize_entity_key from feast.infra.online_stores.online_store import OnlineStore +from feast.infra.online_stores.document_store import DocumentStore, DocumentStoreIndexConfig from feast.infra.utils.postgres.connection_utils import _get_conn, _get_connection_pool from feast.infra.utils.postgres.postgres_config import ConnectionType, PostgreSQLConfig from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto @@ -46,13 +49,13 @@ def _get_conn(self, config: RepoConfig): @log_exceptions_and_usage(online_store="postgres") def online_write_batch( - self, - config: RepoConfig, - table: FeatureView, - data: List[ - Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] - ], - progress: Optional[Callable[[int], Any]], + self, + config: RepoConfig, + table: FeatureView, + data: List[ + Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] + ], + progress: Optional[Callable[[int], Any]], ) -> None: project = config.project @@ -80,7 +83,7 @@ def online_write_batch( # Control the batch so that we can update the 
progress batch_size = 5000 for i in range(0, len(insert_values), batch_size): - cur_batch = insert_values[i : i + batch_size] + cur_batch = insert_values[i: i + batch_size] execute_values( cur, sql.SQL( @@ -104,11 +107,11 @@ def online_write_batch( @log_exceptions_and_usage(online_store="postgres") def online_read( - self, - config: RepoConfig, - table: FeatureView, - entity_keys: List[EntityKeyProto], - requested_features: Optional[List[str]] = None, + self, + config: RepoConfig, + table: FeatureView, + entity_keys: List[EntityKeyProto], + requested_features: Optional[List[str]] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] @@ -175,13 +178,13 @@ def online_read( @log_exceptions_and_usage(online_store="postgres") def update( - self, - config: RepoConfig, - tables_to_delete: Sequence[FeatureView], - tables_to_keep: Sequence[FeatureView], - entities_to_delete: Sequence[Entity], - entities_to_keep: Sequence[Entity], - partial: bool, + self, + config: RepoConfig, + tables_to_delete: Sequence[FeatureView], + tables_to_keep: Sequence[FeatureView], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + partial: bool, ): project = config.project schema_name = config.online_store.db_schema or config.online_store.user @@ -236,10 +239,10 @@ def update( conn.commit() def teardown( - self, - config: RepoConfig, - tables: Sequence[FeatureView], - entities: Sequence[Entity], + self, + config: RepoConfig, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ): project = config.project try: @@ -273,3 +276,75 @@ def _to_naive_utc(ts: datetime): return ts else: return ts.astimezone(pytz.utc).replace(tzinfo=None) + + +# Search query template to find the top k items that are closest to the given embedding +# SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; +SEARCH_QUERY_TEMPLATE = """ +SELECT entity_key, feature_name, value, 
event_ts FROM {table_name} +WHERE feature_name = '{feature_name}' +ORDER BY value <-> %s +LIMIT %s; +""" + +# Create index query template to create an index based on the index type +CREATE_INDEX_QUERY_TEMPLATE = """ +CREATE INDEX ON {table_name} USING {index_type} (embedding {embeding_type}); +""" + + +class PostgresDocumentStoreConfig(DocumentStoreIndexConfig): + type: Literal["postgres"] = "postgres" + + +class PostgresDocumentStore(PostgreSQLOnlineStore, DocumentStore): + + def online_search(self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + embedding: np.ndarray, + top_k: int, + ): + result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] + + with self._get_conn(config) as conn, conn.cursor() as cur: + cur.execute(SEARCH_QUERY_TEMPLATE.format( + table_name=table, + feature_name=requested_feature + ), (embedding, top_k)) + rows = cur.fetchall() + + for row in rows: + # The first column is the entity key + entity_key = EntityKeyProto() + entity_key.ParseFromString(row[0]) + + # The second column is the feature name + feature_name = row[1] + + # The third column is the embedding value + val = ValueProto() + val.ParseFromString(row[2]) + + # The fourth column is the event timestamp + event_ts = row[3] + + res = {} + res[feature_name] = val + result.append((event_ts, res)) + + + return result + + def create_index(self, + config: RepoConfig, + index: str, + index_config: DocumentStoreIndexConfig + ): + with self._get_conn(config) as conn, conn.cursor() as cur: + cur.execute(CREATE_INDEX_QUERY_TEMPLATE.format( + table_name=config.project, + index_type=index, + embeding_type=index_config.embedding_type + )) diff --git a/sdk/python/feast/infra/online_stores/document_store.py b/sdk/python/feast/infra/online_stores/document_store.py new file mode 100644 index 00000000000..f0b64c853b3 --- /dev/null +++ b/sdk/python/feast/infra/online_stores/document_store.py @@ -0,0 +1,38 @@ +from abc import abstractmethod +from datetime
import datetime +from feast.feature_view import FeatureView +from feast.protos.feast.types.Value_pb2 import Value as ValueProto +from feast.repo_config import RepoConfig, FeastConfigBaseModel +from infra.online_stores.online_store import OnlineStore +from typing import Optional, List, Tuple, Dict +import numpy as np + + +class DocumentStoreIndexConfig(FeastConfigBaseModel): + embedding_type: Optional[str] + + +class DocumentStore(OnlineStore): + index: Optional[str] + + @abstractmethod + def online_search(self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + embeddings: np.ndarray, + top_k: int, + ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + raise NotImplementedError( + "You have to implement this!" + ) + + @abstractmethod + def create_index(self, + config: RepoConfig, + index: str, + index_config: DocumentStoreIndexConfig + ): + raise NotImplementedError( + "You have to implement this!" + ) diff --git a/sdk/python/feast/infra/online_stores/helpers.py b/sdk/python/feast/infra/online_stores/helpers.py index 0e2fdb35007..73964c52ff0 100644 --- a/sdk/python/feast/infra/online_stores/helpers.py +++ b/sdk/python/feast/infra/online_stores/helpers.py @@ -9,6 +9,7 @@ serialize_entity_key_prefix, ) from feast.infra.online_stores.online_store import OnlineStore +from feast.infra.online_stores.document_store import DocumentStore from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto @@ -21,6 +22,15 @@ def get_online_store_from_config(online_store_config: Any) -> OnlineStore: return online_store_class() +def get_document_store_from_config(document_store_config: Any) -> DocumentStore: + """Creates a document store corresponding to the given online document store config.""" + module_name = document_store_config.__module__ + qualified_name = type(document_store_config).__name__ + class_name = qualified_name.replace("Config", "") + document_store_class = import_class(module_name, class_name, 
"DocumentStore") + return document_store_class() + + def _redis_key( project: str, entity_key: EntityKeyProto, entity_key_serialization_version=1 ) -> bytes: diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index aca18f4856b..d0b0819bc41 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -1,6 +1,7 @@ from datetime import datetime, timedelta from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +import numpy as np import pandas as pd import pyarrow as pa from tqdm import tqdm @@ -18,7 +19,7 @@ ) from feast.infra.offline_stores.offline_store import RetrievalJob from feast.infra.offline_stores.offline_utils import get_offline_store_from_config -from feast.infra.online_stores.helpers import get_online_store_from_config +from feast.infra.online_stores.helpers import get_online_store_from_config, get_document_store_from_config from feast.infra.provider import Provider from feast.infra.registry.base_registry import BaseRegistry from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto @@ -47,6 +48,7 @@ def __init__(self, config: RepoConfig): self.repo_config = config self._offline_store = None self._online_store = None + self._document_store = None self._batch_engine: Optional[BatchMaterializationEngine] = None @property @@ -56,6 +58,14 @@ def online_store(self): self.repo_config.online_store ) return self._online_store + + @property + def document_store(self): + if not self._document_store: + self._document_store = get_document_store_from_config( + self.repo_config.online_store + ) + return self._document_store @property def offline_store(self): @@ -190,6 +200,22 @@ def online_read( ) return result + @log_exceptions_and_usage(sampler=RatioSampler(ratio=0.001)) + def online_search( + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + embeddings: np.ndarray, + top_k: int + ) -> 
List: + set_usage_attribute("provider", self.__class__.__name__) + result = [] + if self.document_store: + result = self.document_store.online_search(config, table, requested_feature, embeddings, top_k) + return result + + def ingest_df( self, feature_view: FeatureView, diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 2a9670cacef..2f04d86fbee 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -4,6 +4,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import pandas as pd +import numpy as np import pyarrow from tqdm import tqdm @@ -41,13 +42,13 @@ def __init__(self, config: RepoConfig): @abstractmethod def update_infra( - self, - project: str, - tables_to_delete: Sequence[FeatureView], - tables_to_keep: Sequence[FeatureView], - entities_to_delete: Sequence[Entity], - entities_to_keep: Sequence[Entity], - partial: bool, + self, + project: str, + tables_to_delete: Sequence[FeatureView], + tables_to_keep: Sequence[FeatureView], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + partial: bool, ): """ Reconciles cloud resources with the specified set of Feast objects. @@ -66,7 +67,7 @@ def update_infra( pass def plan_infra( - self, config: RepoConfig, desired_registry_proto: RegistryProto + self, config: RepoConfig, desired_registry_proto: RegistryProto ) -> Infra: """ Returns the Infra required to support the desired registry. @@ -79,10 +80,10 @@ def plan_infra( @abstractmethod def teardown_infra( - self, - project: str, - tables: Sequence[FeatureView], - entities: Sequence[Entity], + self, + project: str, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ): """ Tears down all cloud resources for the specified set of Feast objects. 
@@ -96,13 +97,13 @@ def teardown_infra( @abstractmethod def online_write_batch( - self, - config: RepoConfig, - table: FeatureView, - data: List[ - Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] - ], - progress: Optional[Callable[[int], Any]], + self, + config: RepoConfig, + table: FeatureView, + data: List[ + Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] + ], + progress: Optional[Callable[[int], Any]], ) -> None: """ Writes a batch of feature rows to the online store. @@ -121,9 +122,9 @@ def online_write_batch( pass def ingest_df( - self, - feature_view: FeatureView, - df: pd.DataFrame, + self, + feature_view: FeatureView, + df: pd.DataFrame, ): """ Persists a dataframe to the online store. @@ -135,9 +136,9 @@ def ingest_df( pass def ingest_df_to_offline_store( - self, - feature_view: FeatureView, - df: pyarrow.Table, + self, + feature_view: FeatureView, + df: pyarrow.Table, ): """ Persists a dataframe to the offline store. @@ -150,14 +151,14 @@ def ingest_df_to_offline_store( @abstractmethod def materialize_single_feature_view( - self, - config: RepoConfig, - feature_view: FeatureView, - start_date: datetime, - end_date: datetime, - registry: BaseRegistry, - project: str, - tqdm_builder: Callable[[int], tqdm], + self, + config: RepoConfig, + feature_view: FeatureView, + start_date: datetime, + end_date: datetime, + registry: BaseRegistry, + project: str, + tqdm_builder: Callable[[int], tqdm], ) -> None: """ Writes latest feature values in the specified time range to the online store. 
@@ -175,14 +176,14 @@ def materialize_single_feature_view( @abstractmethod def get_historical_features( - self, - config: RepoConfig, - feature_views: List[FeatureView], - feature_refs: List[str], - entity_df: Union[pd.DataFrame, str], - registry: BaseRegistry, - project: str, - full_feature_names: bool, + self, + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pd.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool, ) -> RetrievalJob: """ Retrieves the point-in-time correct historical feature values for the specified entity rows. @@ -207,11 +208,11 @@ def get_historical_features( @abstractmethod def online_read( - self, - config: RepoConfig, - table: FeatureView, - entity_keys: List[EntityKeyProto], - requested_features: Optional[List[str]] = None, + self, + config: RepoConfig, + table: FeatureView, + entity_keys: List[EntityKeyProto], + requested_features: Optional[List[str]] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: """ Reads features values for the given entity keys. @@ -231,7 +232,7 @@ def online_read( @abstractmethod def retrieve_saved_dataset( - self, config: RepoConfig, dataset: SavedDataset + self, config: RepoConfig, dataset: SavedDataset ) -> RetrievalJob: """ Reads a saved dataset. @@ -247,11 +248,11 @@ def retrieve_saved_dataset( @abstractmethod def write_feature_service_logs( - self, - feature_service: FeatureService, - logs: Union[pyarrow.Table, Path], - config: RepoConfig, - registry: BaseRegistry, + self, + feature_service: FeatureService, + logs: Union[pyarrow.Table, Path], + config: RepoConfig, + registry: BaseRegistry, ): """ Writes features and entities logged by a feature server to the offline store. 
@@ -269,12 +270,12 @@ def write_feature_service_logs( @abstractmethod def retrieve_feature_service_logs( - self, - feature_service: FeatureService, - start_date: datetime, - end_date: datetime, - config: RepoConfig, - registry: BaseRegistry, + self, + feature_service: FeatureService, + start_date: datetime, + end_date: datetime, + config: RepoConfig, + registry: BaseRegistry, ) -> RetrievalJob: """ Reads logged features for the specified time window. @@ -295,6 +296,30 @@ def get_feature_server_endpoint(self) -> Optional[str]: """Returns endpoint for the feature server, if it exists.""" return None + @abstractmethod + def online_search( + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + document: Union[str, np.ndarray], + top_k: int, + ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + """ + Searches for the top-k nearest neighbors of the given document in the online document store. + + Args: + config: The config for the current feature store. + table: The feature view whose embeddings should be searched. + requested_feature: The requested document feature name. + document: The document to search for. + top_k: The number of nearest neighbors to return. + + Returns: + A list of (event timestamp, feature values) tuples, one per matching document, where the feature values map the feature name to its value. + """ + pass + def get_provider(config: RepoConfig) -> Provider: if "."
not in config.provider: diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 5708754622b..15c711567f2 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -63,6 +63,9 @@ "mysql": "feast.infra.online_stores.contrib.mysql_online_store.mysql.MySQLOnlineStore", "rockset": "feast.infra.online_stores.contrib.rockset_online_store.rockset.RocksetOnlineStore", "hazelcast": "feast.infra.online_stores.contrib.hazelcast_online_store.hazelcast_online_store.HazelcastOnlineStore", + + # below are supported Online Document Store + "postgresDocument": "feast.infra.online_stores.contrib.postgres.PostgresDocumentStore", } OFFLINE_STORE_CLASS_FOR_TYPE = { @@ -181,6 +184,9 @@ class RepoConfig(FeastBaseModel): coerce_tz_aware: Optional[bool] = True """ If True, coerces entity_df timestamp columns to be timezone aware (to UTC by default). """ + document_store_config: Any = Field(None, alias="document_store") + """ DocumentStoreConfig: Document store configuration (optional depending on provider) """ + def __init__(self, **data: Any): super().__init__(**data) From 58d5d948e0d3435c2c882f441081f5ae2cd41707 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 31 Mar 2024 10:55:45 -0700 Subject: [PATCH 02/40] feat: add document store --- sdk/python/feast/feature_store.py | 56 +++++--- .../infra/online_stores/contrib/postgres.py | 101 +++++++------- .../infra/online_stores/document_store.py | 39 +++--- .../feast/infra/online_stores/helpers.py | 2 +- .../feast/infra/passthrough_provider.py | 14 +- sdk/python/feast/infra/provider.py | 130 +++++++++--------- sdk/python/feast/repo_config.py | 1 - 7 files changed, 184 insertions(+), 159 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index b245957b98d..732f609350f 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -33,9 +33,9 @@ cast, ) +import numpy as np import pandas as pd import pyarrow as pa 
-import numpy as np from colorama import Fore, Style from google.protobuf.timestamp_pb2 import Timestamp from tqdm import tqdm @@ -1686,11 +1686,12 @@ def _get_online_features( return OnlineResponse(online_features_response) @log_exceptions_and_usage - def get_top_k_document_features(self, - feature: Union[str, FeatureService], - document: Union[str, np.ndarray], - top_k: int, - ) -> OnlineResponse: + def get_top_k_document_features( + self, + feature: str, + document: Union[str, np.ndarray], + top_k: int, + ) -> OnlineResponse: """ Retrieves the top k cloeses document features. @@ -1708,20 +1709,20 @@ def get_top_k_document_features(self, ) def _get_top_k_document_features( - self, - feature: Union[str, FeatureService], - document: Union[str, np.ndarray], - top_k: int, + self, + feature: str, + document: Union[str, np.ndarray], + top_k: int, ): ( requested_feature_views, - requested_on_demand_feature_views + requested_on_demand_feature_views, ) = self._get_feature_views_to_use( - features=[feature], - allow_cache=True, - hide_dummy_entity=False + features=[feature], allow_cache=True, hide_dummy_entity=False + ) + requested_feature = ( + feature.split(":")[1] if isinstance(feature, str) else feature ) - requested_feature = feature.split(":")[1] if isinstance(feature, str) else feature provider = self._get_provider() document_features = self._search_from_document_store( provider, @@ -1737,7 +1738,7 @@ def _get_top_k_document_features( online_features_response, False, requested_feature, - requested_feature_views[0] + requested_feature_views[0], ) return OnlineResponse(online_features_response) @@ -1975,8 +1976,29 @@ def _search_from_document_store( document=document, top_k=top_k, ) - return documents + # Each row is a set of features for a given entity key. We only need to convert + # the data to Protobuf once. 
+ null_value = Value() + read_row_protos = [] + for doc in documents: + row_ts_proto = Timestamp() + row_ts, feature_data = doc + # TODO (Ly): reuse whatever timestamp if row_ts is None? + if row_ts is not None: + row_ts_proto.FromDatetime(row_ts) + event_timestamps = [row_ts_proto] + if feature_data is None: + statuses = [FieldStatus.NOT_FOUND] + values = [null_value] + else: + statuses = [] + values = [] + for feature_name, feature_value in feature_data.items(): + statuses.append(FieldStatus.PRESENT) + values.append(feature_value) + read_row_protos.append((event_timestamps, statuses, values)) + return read_row_protos @staticmethod def _populate_response_from_feature_data( diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index a54edd889ae..976ce334059 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -13,10 +13,12 @@ from feast import Entity from feast.feature_view import FeatureView -from feast.feature import Feature from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.online_stores.document_store import ( + DocumentStore, + DocumentStoreIndexConfig, +) from feast.infra.online_stores.online_store import OnlineStore -from feast.infra.online_stores.document_store import DocumentStore, DocumentStoreIndexConfig from feast.infra.utils.postgres.connection_utils import _get_conn, _get_connection_pool from feast.infra.utils.postgres.postgres_config import ConnectionType, PostgreSQLConfig from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto @@ -49,13 +51,13 @@ def _get_conn(self, config: RepoConfig): @log_exceptions_and_usage(online_store="postgres") def online_write_batch( - self, - config: RepoConfig, - table: FeatureView, - data: List[ - Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] - ], - progress: Optional[Callable[[int], 
Any]], + self, + config: RepoConfig, + table: FeatureView, + data: List[ + Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] + ], + progress: Optional[Callable[[int], Any]], ) -> None: project = config.project @@ -83,7 +85,7 @@ def online_write_batch( # Control the batch so that we can update the progress batch_size = 5000 for i in range(0, len(insert_values), batch_size): - cur_batch = insert_values[i: i + batch_size] + cur_batch = insert_values[i : i + batch_size] execute_values( cur, sql.SQL( @@ -107,11 +109,11 @@ def online_write_batch( @log_exceptions_and_usage(online_store="postgres") def online_read( - self, - config: RepoConfig, - table: FeatureView, - entity_keys: List[EntityKeyProto], - requested_features: Optional[List[str]] = None, + self, + config: RepoConfig, + table: FeatureView, + entity_keys: List[EntityKeyProto], + requested_features: Optional[List[str]] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] @@ -178,13 +180,13 @@ def online_read( @log_exceptions_and_usage(online_store="postgres") def update( - self, - config: RepoConfig, - tables_to_delete: Sequence[FeatureView], - tables_to_keep: Sequence[FeatureView], - entities_to_delete: Sequence[Entity], - entities_to_keep: Sequence[Entity], - partial: bool, + self, + config: RepoConfig, + tables_to_delete: Sequence[FeatureView], + tables_to_keep: Sequence[FeatureView], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + partial: bool, ): project = config.project schema_name = config.online_store.db_schema or config.online_store.user @@ -239,10 +241,10 @@ def update( conn.commit() def teardown( - self, - config: RepoConfig, - tables: Sequence[FeatureView], - entities: Sequence[Entity], + self, + config: RepoConfig, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ): project = config.project try: @@ -298,21 +300,23 @@ class 
PostgresDocumentStoreConfig(DocumentStoreIndexConfig): class PostgresDocumentStore(PostgreSQLOnlineStore, DocumentStore): - - def online_search(self, - config: RepoConfig, - table: FeatureView, - requested_feature: str, - embedding: np.ndarray, - top_k: int, - ): + def online_search( + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + embedding: np.ndarray, + top_k: int, + ): result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] with self._get_conn(config) as conn, conn.cursor() as cur: - cur.execute(SEARCH_QUERY_TEMPLATE.format( - table_name=table, - feature_name=requested_feature - ), (embedding, top_k)) + cur.execute( + SEARCH_QUERY_TEMPLATE.format( + table_name=table, feature_name=requested_feature + ), + (embedding, top_k), + ) rows = cur.fetchall() for row in rows: @@ -334,17 +338,16 @@ def online_search(self, res[feature_name] = val result.append((event_ts, res)) - return result - def create_index(self, - config: RepoConfig, - index: str, - index_config: DocumentStoreIndexConfig - ): + def create_index( + self, config: RepoConfig, index: str, index_config: DocumentStoreIndexConfig + ): with self._get_conn(config) as conn, conn.cursor() as cur: - cur.execute(CREATE_INDEX_QUERY_TEMPLATE.format( - table_name=config.project, - index_type=index, - embeding_type=index_config.embedding_type - )) + cur.execute( + CREATE_INDEX_QUERY_TEMPLATE.format( + table_name=config.project, + index_type=index, + embeding_type=index_config.embedding_type, + ) + ) diff --git a/sdk/python/feast/infra/online_stores/document_store.py b/sdk/python/feast/infra/online_stores/document_store.py index f0b64c853b3..248d1caa8c7 100644 --- a/sdk/python/feast/infra/online_stores/document_store.py +++ b/sdk/python/feast/infra/online_stores/document_store.py @@ -1,11 +1,13 @@ from abc import abstractmethod from datetime import datetime +from typing import Dict, List, Optional, Tuple + +import numpy as np + from feast.feature_view import 
FeatureView from feast.protos.feast.types.Value_pb2 import Value as ValueProto -from feast.repo_config import RepoConfig, FeastConfigBaseModel +from feast.repo_config import FeastConfigBaseModel, RepoConfig from infra.online_stores.online_store import OnlineStore -from typing import Optional, List, Tuple, Dict -import numpy as np class DocumentStoreIndexConfig(FeastConfigBaseModel): @@ -16,23 +18,18 @@ class DocumentStore(OnlineStore): index: Optional[str] @abstractmethod - def online_search(self, - config: RepoConfig, - table: FeatureView, - requested_feature: str, - embeddings: np.ndarray, - top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: - raise NotImplementedError( - "You have to implement this!" - ) + def online_search( + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + embeddings: np.ndarray, + top_k: int, + ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + raise NotImplementedError("You have to implement this!") @abstractmethod - def create_index(self, - config: RepoConfig, - index: str, - index_config: DocumentStoreIndexConfig - ): - raise NotImplementedError( - "You have to implement this!" 
- ) + def create_index( + self, config: RepoConfig, index: str, index_config: DocumentStoreIndexConfig + ): + raise NotImplementedError("You have to implement this!") diff --git a/sdk/python/feast/infra/online_stores/helpers.py b/sdk/python/feast/infra/online_stores/helpers.py index 73964c52ff0..af6931c2653 100644 --- a/sdk/python/feast/infra/online_stores/helpers.py +++ b/sdk/python/feast/infra/online_stores/helpers.py @@ -8,8 +8,8 @@ serialize_entity_key, serialize_entity_key_prefix, ) -from feast.infra.online_stores.online_store import OnlineStore from feast.infra.online_stores.document_store import DocumentStore +from feast.infra.online_stores.online_store import OnlineStore from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index d0b0819bc41..bbd93fff545 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -19,7 +19,10 @@ ) from feast.infra.offline_stores.offline_store import RetrievalJob from feast.infra.offline_stores.offline_utils import get_offline_store_from_config -from feast.infra.online_stores.helpers import get_online_store_from_config, get_document_store_from_config +from feast.infra.online_stores.helpers import ( + get_document_store_from_config, + get_online_store_from_config, +) from feast.infra.provider import Provider from feast.infra.registry.base_registry import BaseRegistry from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto @@ -58,7 +61,7 @@ def online_store(self): self.repo_config.online_store ) return self._online_store - + @property def document_store(self): if not self._document_store: @@ -207,15 +210,16 @@ def online_search( table: FeatureView, requested_feature: str, embeddings: np.ndarray, - top_k: int + top_k: int, ) -> List: set_usage_attribute("provider", self.__class__.__name__) result = [] if 
self.document_store: - result = self.document_store.online_search(config, table, requested_feature, embeddings, top_k) + result = self.document_store.online_search( + config, table, requested_feature, embeddings, top_k + ) return result - def ingest_df( self, feature_view: FeatureView, diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 2f04d86fbee..9bdb049ecb0 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -3,8 +3,8 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union -import pandas as pd import numpy as np +import pandas as pd import pyarrow from tqdm import tqdm @@ -42,13 +42,13 @@ def __init__(self, config: RepoConfig): @abstractmethod def update_infra( - self, - project: str, - tables_to_delete: Sequence[FeatureView], - tables_to_keep: Sequence[FeatureView], - entities_to_delete: Sequence[Entity], - entities_to_keep: Sequence[Entity], - partial: bool, + self, + project: str, + tables_to_delete: Sequence[FeatureView], + tables_to_keep: Sequence[FeatureView], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + partial: bool, ): """ Reconciles cloud resources with the specified set of Feast objects. @@ -67,7 +67,7 @@ def update_infra( pass def plan_infra( - self, config: RepoConfig, desired_registry_proto: RegistryProto + self, config: RepoConfig, desired_registry_proto: RegistryProto ) -> Infra: """ Returns the Infra required to support the desired registry. @@ -80,10 +80,10 @@ def plan_infra( @abstractmethod def teardown_infra( - self, - project: str, - tables: Sequence[FeatureView], - entities: Sequence[Entity], + self, + project: str, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ): """ Tears down all cloud resources for the specified set of Feast objects. 
@@ -97,13 +97,13 @@ def teardown_infra( @abstractmethod def online_write_batch( - self, - config: RepoConfig, - table: FeatureView, - data: List[ - Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] - ], - progress: Optional[Callable[[int], Any]], + self, + config: RepoConfig, + table: FeatureView, + data: List[ + Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] + ], + progress: Optional[Callable[[int], Any]], ) -> None: """ Writes a batch of feature rows to the online store. @@ -122,9 +122,9 @@ def online_write_batch( pass def ingest_df( - self, - feature_view: FeatureView, - df: pd.DataFrame, + self, + feature_view: FeatureView, + df: pd.DataFrame, ): """ Persists a dataframe to the online store. @@ -136,9 +136,9 @@ def ingest_df( pass def ingest_df_to_offline_store( - self, - feature_view: FeatureView, - df: pyarrow.Table, + self, + feature_view: FeatureView, + df: pyarrow.Table, ): """ Persists a dataframe to the offline store. @@ -151,14 +151,14 @@ def ingest_df_to_offline_store( @abstractmethod def materialize_single_feature_view( - self, - config: RepoConfig, - feature_view: FeatureView, - start_date: datetime, - end_date: datetime, - registry: BaseRegistry, - project: str, - tqdm_builder: Callable[[int], tqdm], + self, + config: RepoConfig, + feature_view: FeatureView, + start_date: datetime, + end_date: datetime, + registry: BaseRegistry, + project: str, + tqdm_builder: Callable[[int], tqdm], ) -> None: """ Writes latest feature values in the specified time range to the online store. 
@@ -176,14 +176,14 @@ def materialize_single_feature_view( @abstractmethod def get_historical_features( - self, - config: RepoConfig, - feature_views: List[FeatureView], - feature_refs: List[str], - entity_df: Union[pd.DataFrame, str], - registry: BaseRegistry, - project: str, - full_feature_names: bool, + self, + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pd.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool, ) -> RetrievalJob: """ Retrieves the point-in-time correct historical feature values for the specified entity rows. @@ -208,11 +208,11 @@ def get_historical_features( @abstractmethod def online_read( - self, - config: RepoConfig, - table: FeatureView, - entity_keys: List[EntityKeyProto], - requested_features: Optional[List[str]] = None, + self, + config: RepoConfig, + table: FeatureView, + entity_keys: List[EntityKeyProto], + requested_features: Optional[List[str]] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: """ Reads features values for the given entity keys. @@ -232,7 +232,7 @@ def online_read( @abstractmethod def retrieve_saved_dataset( - self, config: RepoConfig, dataset: SavedDataset + self, config: RepoConfig, dataset: SavedDataset ) -> RetrievalJob: """ Reads a saved dataset. @@ -248,11 +248,11 @@ def retrieve_saved_dataset( @abstractmethod def write_feature_service_logs( - self, - feature_service: FeatureService, - logs: Union[pyarrow.Table, Path], - config: RepoConfig, - registry: BaseRegistry, + self, + feature_service: FeatureService, + logs: Union[pyarrow.Table, Path], + config: RepoConfig, + registry: BaseRegistry, ): """ Writes features and entities logged by a feature server to the offline store. 
@@ -270,12 +270,12 @@ def write_feature_service_logs( @abstractmethod def retrieve_feature_service_logs( - self, - feature_service: FeatureService, - start_date: datetime, - end_date: datetime, - config: RepoConfig, - registry: BaseRegistry, + self, + feature_service: FeatureService, + start_date: datetime, + end_date: datetime, + config: RepoConfig, + registry: BaseRegistry, ) -> RetrievalJob: """ Reads logged features for the specified time window. @@ -298,12 +298,12 @@ def get_feature_server_endpoint(self) -> Optional[str]: @abstractmethod def online_search( - self, - config: RepoConfig, - table: FeatureView, - requested_feature: str, - document: Union[str, np.ndarray], - top_k: int, + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + document: Union[str, np.ndarray], + top_k: int, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: """ Searches for the top-k nearest neighbors of the given document in the online document store. diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 15c711567f2..cc12bc523e3 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -63,7 +63,6 @@ "mysql": "feast.infra.online_stores.contrib.mysql_online_store.mysql.MySQLOnlineStore", "rockset": "feast.infra.online_stores.contrib.rockset_online_store.rockset.RocksetOnlineStore", "hazelcast": "feast.infra.online_stores.contrib.hazelcast_online_store.hazelcast_online_store.HazelcastOnlineStore", - # below are supported Online Document Store "postgresDocument": "feast.infra.online_stores.contrib.postgres.PostgresDocumentStore", } From 2cd73d199f7673c3aa6902dbf2243a66a7ef0092 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 31 Mar 2024 16:45:50 -0700 Subject: [PATCH 03/40] feat: add document store --- .../infra/online_stores/contrib/postgres.py | 11 +++++---- .../infra/online_stores/document_store.py | 24 +++++++------------ 2 files changed, 16 insertions(+), 19 deletions(-) diff 
--git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 976ce334059..150c7398f1d 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -341,13 +341,16 @@ def online_search( return result def create_index( - self, config: RepoConfig, index: str, index_config: DocumentStoreIndexConfig + self, + config: RepoConfig, + table: str ): + document_store_config = config.document_store_config with self._get_conn(config) as conn, conn.cursor() as cur: cur.execute( CREATE_INDEX_QUERY_TEMPLATE.format( - table_name=config.project, - index_type=index, - embeding_type=index_config.embedding_type, + table=table, + index_type=document_store_config.index_type, + embeding_type=document_store_config.embedding_type, ) ) diff --git a/sdk/python/feast/infra/online_stores/document_store.py b/sdk/python/feast/infra/online_stores/document_store.py index 248d1caa8c7..e1a1f6f05df 100644 --- a/sdk/python/feast/infra/online_stores/document_store.py +++ b/sdk/python/feast/infra/online_stores/document_store.py @@ -7,29 +7,23 @@ from feast.feature_view import FeatureView from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.repo_config import FeastConfigBaseModel, RepoConfig -from infra.online_stores.online_store import OnlineStore +from feast.infra.online_stores.online_store import OnlineStore class DocumentStoreIndexConfig(FeastConfigBaseModel): + index_type: Optional[str] embedding_type: Optional[str] class DocumentStore(OnlineStore): - index: Optional[str] @abstractmethod def online_search( - self, - config: RepoConfig, - table: FeatureView, - requested_feature: str, - embeddings: np.ndarray, - top_k: int, + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + embeddings: np.ndarray, + top_k: int, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: - raise 
NotImplementedError("You have to implement this!") - - @abstractmethod - def create_index( - self, config: RepoConfig, index: str, index_config: DocumentStoreIndexConfig - ): - raise NotImplementedError("You have to implement this!") + raise NotImplementedError("You have to implement this!") \ No newline at end of file From d2e0a5998340eae73146029a5adfda4d611bcb2d Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 31 Mar 2024 16:46:09 -0700 Subject: [PATCH 04/40] feat: add document store --- .../infra/online_stores/contrib/postgres.py | 6 +----- .../feast/infra/online_stores/document_store.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 150c7398f1d..d2ff6111cff 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -340,11 +340,7 @@ def online_search( return result - def create_index( - self, - config: RepoConfig, - table: str - ): + def create_index(self, config: RepoConfig, table: str): document_store_config = config.document_store_config with self._get_conn(config) as conn, conn.cursor() as cur: cur.execute( diff --git a/sdk/python/feast/infra/online_stores/document_store.py b/sdk/python/feast/infra/online_stores/document_store.py index e1a1f6f05df..e7edf8b5411 100644 --- a/sdk/python/feast/infra/online_stores/document_store.py +++ b/sdk/python/feast/infra/online_stores/document_store.py @@ -5,9 +5,9 @@ import numpy as np from feast.feature_view import FeatureView +from feast.infra.online_stores.online_store import OnlineStore from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.repo_config import FeastConfigBaseModel, RepoConfig -from feast.infra.online_stores.online_store import OnlineStore class DocumentStoreIndexConfig(FeastConfigBaseModel): @@ -16,14 +16,13 @@ class 
DocumentStoreIndexConfig(FeastConfigBaseModel): class DocumentStore(OnlineStore): - @abstractmethod def online_search( - self, - config: RepoConfig, - table: FeatureView, - requested_feature: str, - embeddings: np.ndarray, - top_k: int, + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + embeddings: np.ndarray, + top_k: int, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: - raise NotImplementedError("You have to implement this!") \ No newline at end of file + raise NotImplementedError("You have to implement this!") From 7079e7ffa5fdfa174d0b2d44b4bbd8bc5a18838d Mon Sep 17 00:00:00 2001 From: cmuhao Date: Tue, 9 Apr 2024 00:06:36 -0700 Subject: [PATCH 05/40] remove DocumentStore --- sdk/python/feast/feature_store.py | 34 +++-- .../infra/online_stores/contrib/postgres.py | 134 ++++++++---------- .../infra/online_stores/document_store.py | 28 ---- .../feast/infra/online_stores/helpers.py | 10 -- .../feast/infra/online_stores/online_store.py | 20 +++ .../feast/infra/passthrough_provider.py | 20 +-- sdk/python/feast/infra/provider.py | 7 +- 7 files changed, 105 insertions(+), 148 deletions(-) delete mode 100644 sdk/python/feast/infra/online_stores/document_store.py diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 732f609350f..84d84d9f65d 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1686,37 +1686,41 @@ def _get_online_features( return OnlineResponse(online_features_response) @log_exceptions_and_usage - def get_top_k_document_features( + def retrieve_online_documents( self, feature: str, - document: Union[str, np.ndarray], + query: Union[str, List[float]], top_k: int, ) -> OnlineResponse: """ - Retrieves the top k cloeses document features. + Retrieves the top k closest document features. Args: feature: The list of document features that should be retrieved from the online document store. 
These features can be specified either as a list of string document feature references or as a feature service. String feature references must have format "feature_view:feature", e.g, "document_fv:document_embedding_feature". - document: The document to retrieve the closest document features for. + query: The query to retrieve the closest document features for. top_k: The number of closest document features to retrieve. """ - return self._get_top_k_document_features( + return self._retrieve_online_documents( feature=feature, - document=document, + query=query, top_k=top_k, ) - def _get_top_k_document_features( + def _retrieve_online_documents( self, feature: str, - document: Union[str, np.ndarray], + query: Union[str, List[float]], top_k: int, ): + if isinstance(query, str): + raise ValueError( + "Using embedding functionality is not supported for document retrieval. Please embed the query before calling retrieve_online_documents." + ) ( requested_feature_views, - requested_on_demand_feature_views, + _, ) = self._get_feature_views_to_use( features=[feature], allow_cache=True, hide_dummy_entity=False ) @@ -1724,11 +1728,11 @@ def _get_top_k_document_features( feature.split(":")[1] if isinstance(feature, str) else feature ) provider = self._get_provider() - document_features = self._search_from_document_store( + document_features = self._retrieve_from_online_store( provider, requested_feature_views[0], requested_feature, - document, + query, top_k, ) online_features_response = GetOnlineFeaturesResponse(results=[]) @@ -1958,22 +1962,22 @@ def _read_from_online_store( read_row_protos.append((event_timestamps, statuses, values)) return read_row_protos - def _search_from_document_store( + def _retrieve_from_online_store( self, provider: Provider, table: FeatureView, requested_feature: str, - document: Union[str, np.ndarray], + query: Union[str, List[float]], top_k: int, ) -> List[Tuple[List[Timestamp], List["FieldStatus.ValueType"], List[Value]]]: """ Search and return 
document features from the online document store. """ - documents = provider.online_search( + documents = provider.retrieve_online_documents( config=self.config, table=table, requested_feature=requested_feature, - document=document, + query=query, top_k=top_k, ) # Each row is a set of features for a given entity key. We only need to convert diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index d2ff6111cff..e671bcff2ec 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -14,10 +14,6 @@ from feast import Entity from feast.feature_view import FeatureView from feast.infra.key_encoding_utils import serialize_entity_key -from feast.infra.online_stores.document_store import ( - DocumentStore, - DocumentStoreIndexConfig, -) from feast.infra.online_stores.online_store import OnlineStore from feast.infra.utils.postgres.connection_utils import _get_conn, _get_connection_pool from feast.infra.utils.postgres.postgres_config import ConnectionType, PostgreSQLConfig @@ -27,6 +23,22 @@ from feast.usage import log_exceptions_and_usage +# Search query template to find the top k items that are closest to the given embedding +# SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; +SEARCH_QUERY_TEMPLATE = """ +SELECT feature_name, value, event_ts FROM {table_name} +WHERE feature_name = '{feature_name}' +ORDER BY value <-> %s +LIMIT %s; +""" + + +# Create index query template to create a index based on the index type +CREATE_INDEX_QUERY_TEMPLATE = """ +CREATE INDEX ON {table_name} USING {index_type} (embedding {embeding_type}); +""" + + class PostgreSQLOnlineStoreConfig(PostgreSQLConfig): type: Literal["postgres"] = "postgres" @@ -256,6 +268,48 @@ def teardown( logging.exception("Teardown failed") raise + def retrieve_online_documents( + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + 
embedding: List[float], + top_k: int, + ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + """ + + Args: + config: Feast configuration object + table: FeatureView object as the table to search + requested_feature: The requested feature as the column to search + embedding: The query embedding to search for + top_k: The number of items to return + Returns: + List of tuples containing the event timestamp and the document feature + + """ + + # Convert the embedding to a string to be used in postgres vector search + query_embedding_str = f"'[{','.join(str(el) for el in embedding)}]'" + + result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] + with self._get_conn(config) as conn, conn.cursor() as cur: + cur.execute( + SEARCH_QUERY_TEMPLATE.format( + table_name=table, feature_name=requested_feature + ), + (query_embedding_str, top_k), + ) + rows = cur.fetchall() + + for feature_name, value, event_ts in rows: + val = ValueProto() + val.ParseFromString(value) + + res = {feature_name: val} + result.append((event_ts, res)) + + return result def _table_id(project: str, table: FeatureView) -> str: return f"{project}_{table.name}" @@ -278,75 +332,3 @@ def _to_naive_utc(ts: datetime): return ts else: return ts.astimezone(pytz.utc).replace(tzinfo=None) - - -# Search query template to find the top k items that are closest to the given embedding -# SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; -SEARCH_QUERY_TEMPLATE = """ -SELECT entity_key, feature_name, value, event_ts FROM {table_name} -WHERE feature_name = '{feature_name}' -ORDER BY value <-> %s -LIMIT %s; -""" - -# Create index query template to create a index based on the index type -CREATE_INDEX_QUERY_TEMPLATE = """ -CREATE INDEX ON {table_name} USING {index_type} (embedding {embeding_type}); -""" - - -class PostgresDocumentStoreConfig(DocumentStoreIndexConfig): - type: Literal["postgres"] = "postgres" - - -class PostgresDocumentStore(PostgreSQLOnlineStore, 
DocumentStore): - def online_search( - self, - config: RepoConfig, - table: FeatureView, - requested_feature: str, - embedding: np.ndarray, - top_k: int, - ): - result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] - - with self._get_conn(config) as conn, conn.cursor() as cur: - cur.execute( - SEARCH_QUERY_TEMPLATE.format( - table_name=table, feature_name=requested_feature - ), - (embedding, top_k), - ) - rows = cur.fetchall() - - for row in rows: - # The first column is the entity key - entity_key = EntityKeyProto() - entity_key.ParseFromString(row[0]) - - # The second column is the feature name - feature_name = row[1] - - # The third column is the embedding value - val = ValueProto() - val.ParseFromString(row[2]) - - # The fourth column is the event timestamp - event_ts = row[3] - - res = {} - res[feature_name] = val - result.append((event_ts, res)) - - return result - - def create_index(self, config: RepoConfig, table: str): - document_store_config = config.document_store_config - with self._get_conn(config) as conn, conn.cursor() as cur: - cur.execute( - CREATE_INDEX_QUERY_TEMPLATE.format( - table=table, - index_type=document_store_config.index_type, - embeding_type=document_store_config.embedding_type, - ) - ) diff --git a/sdk/python/feast/infra/online_stores/document_store.py b/sdk/python/feast/infra/online_stores/document_store.py deleted file mode 100644 index e7edf8b5411..00000000000 --- a/sdk/python/feast/infra/online_stores/document_store.py +++ /dev/null @@ -1,28 +0,0 @@ -from abc import abstractmethod -from datetime import datetime -from typing import Dict, List, Optional, Tuple - -import numpy as np - -from feast.feature_view import FeatureView -from feast.infra.online_stores.online_store import OnlineStore -from feast.protos.feast.types.Value_pb2 import Value as ValueProto -from feast.repo_config import FeastConfigBaseModel, RepoConfig - - -class DocumentStoreIndexConfig(FeastConfigBaseModel): - index_type: Optional[str] - 
embedding_type: Optional[str] - - -class DocumentStore(OnlineStore): - @abstractmethod - def online_search( - self, - config: RepoConfig, - table: FeatureView, - requested_feature: str, - embeddings: np.ndarray, - top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: - raise NotImplementedError("You have to implement this!") diff --git a/sdk/python/feast/infra/online_stores/helpers.py b/sdk/python/feast/infra/online_stores/helpers.py index af6931c2653..0e2fdb35007 100644 --- a/sdk/python/feast/infra/online_stores/helpers.py +++ b/sdk/python/feast/infra/online_stores/helpers.py @@ -8,7 +8,6 @@ serialize_entity_key, serialize_entity_key_prefix, ) -from feast.infra.online_stores.document_store import DocumentStore from feast.infra.online_stores.online_store import OnlineStore from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto @@ -22,15 +21,6 @@ def get_online_store_from_config(online_store_config: Any) -> OnlineStore: return online_store_class() -def get_document_store_from_config(document_store_config: Any) -> DocumentStore: - """Creates a document store corresponding to the given online document store config.""" - module_name = document_store_config.__module__ - qualified_name = type(document_store_config).__name__ - class_name = qualified_name.replace("Config", "") - document_store_class = import_class(module_name, class_name, "DocumentStore") - return document_store_class() - - def _redis_key( project: str, entity_key: EntityKeyProto, entity_key_serialization_version=1 ) -> bytes: diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index fcc3376dce2..d5c9a9db78a 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -134,3 +134,23 @@ def teardown( entities: Entities whose corresponding infrastructure should be deleted. 
""" pass + + def retrieve_online_documents( + self, + config: RepoConfig, + table: FeatureView, + requested_feature: str, + embedding: List[float], + top_k: int, + ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + """ + Retrieves online feature values for the specified embeddings. + + Args: + config: The config for the current feature store. + table: The feature view whose feature values should be read. + requested_feature: The name of the feature whose embeddings should be used for retrieval. + embedding: The embeddings to use for retrieval. + top_k: The number of nearest neighbors to retrieve. + """ + pass \ No newline at end of file diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index bbd93fff545..fddcdcc9628 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -1,7 +1,6 @@ from datetime import datetime, timedelta from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union -import numpy as np import pandas as pd import pyarrow as pa from tqdm import tqdm @@ -20,7 +19,6 @@ from feast.infra.offline_stores.offline_store import RetrievalJob from feast.infra.offline_stores.offline_utils import get_offline_store_from_config from feast.infra.online_stores.helpers import ( - get_document_store_from_config, get_online_store_from_config, ) from feast.infra.provider import Provider @@ -62,14 +60,6 @@ def online_store(self): ) return self._online_store - @property - def document_store(self): - if not self._document_store: - self._document_store = get_document_store_from_config( - self.repo_config.online_store - ) - return self._document_store - @property def offline_store(self): if not self._offline_store: @@ -204,19 +194,19 @@ def online_read( return result @log_exceptions_and_usage(sampler=RatioSampler(ratio=0.001)) - def online_search( + def retrieve_online_documents( self, config: RepoConfig, table: 
FeatureView, requested_feature: str, - embeddings: np.ndarray, + embedding: List[float], top_k: int, ) -> List: set_usage_attribute("provider", self.__class__.__name__) result = [] - if self.document_store: - result = self.document_store.online_search( - config, table, requested_feature, embeddings, top_k + if self.online_store: + result = self.online_store.retrieve_online_documents( + config, table, requested_feature, embedding, top_k ) return result diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 9bdb049ecb0..c627af04ba9 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union -import numpy as np import pandas as pd import pyarrow from tqdm import tqdm @@ -297,12 +296,12 @@ def get_feature_server_endpoint(self) -> Optional[str]: return None @abstractmethod - def online_search( + def retrieve_online_documents( self, config: RepoConfig, table: FeatureView, requested_feature: str, - document: Union[str, np.ndarray], + query: List[float], top_k: int, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: """ @@ -312,7 +311,7 @@ def online_search( config: The config for the current feature store. table: The feature view whose embeddings should be searched. requested_feature: the requested document feature name. - document: The document to search for. + query: The query embedding to search for. top_k: The number of nearest neighbors to return. 
Returns: From 8c9ee97cea0d3aebf7ef572f99f86d5622b6a3cf Mon Sep 17 00:00:00 2001 From: cmuhao Date: Tue, 9 Apr 2024 00:07:01 -0700 Subject: [PATCH 06/40] format --- sdk/python/feast/feature_store.py | 5 +---- sdk/python/feast/infra/online_stores/contrib/postgres.py | 2 +- sdk/python/feast/infra/online_stores/online_store.py | 2 +- sdk/python/feast/infra/passthrough_provider.py | 4 +--- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 84d84d9f65d..234c7e08cc7 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1718,10 +1718,7 @@ def _retrieve_online_documents( raise ValueError( "Using embedding functionality is not supported for document retrieval. Please embed the query before calling retrieve_online_documents." ) - ( - requested_feature_views, - _, - ) = self._get_feature_views_to_use( + (requested_feature_views, _,) = self._get_feature_views_to_use( features=[feature], allow_cache=True, hide_dummy_entity=False ) requested_feature = ( diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index e671bcff2ec..75b5b47059a 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -22,7 +22,6 @@ from feast.repo_config import RepoConfig from feast.usage import log_exceptions_and_usage - # Search query template to find the top k items that are closest to the given embedding # SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; SEARCH_QUERY_TEMPLATE = """ @@ -311,6 +310,7 @@ def retrieve_online_documents( return result + def _table_id(project: str, table: FeatureView) -> str: return f"{project}_{table.name}" diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index d5c9a9db78a..4cbd935a539 100644 --- 
a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -153,4 +153,4 @@ def retrieve_online_documents( embedding: The embeddings to use for retrieval. top_k: The number of nearest neighbors to retrieve. """ - pass \ No newline at end of file + pass diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index fddcdcc9628..bc53204f2ab 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -18,9 +18,7 @@ ) from feast.infra.offline_stores.offline_store import RetrievalJob from feast.infra.offline_stores.offline_utils import get_offline_store_from_config -from feast.infra.online_stores.helpers import ( - get_online_store_from_config, -) +from feast.infra.online_stores.helpers import get_online_store_from_config from feast.infra.provider import Provider from feast.infra.registry.base_registry import BaseRegistry from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto From 29d98cd4a8906bcd3b0b1970a7477d9222b5cd85 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Tue, 9 Apr 2024 00:13:54 -0700 Subject: [PATCH 07/40] format --- sdk/python/feast/feature_store.py | 8 +++++--- sdk/python/feast/infra/online_stores/contrib/postgres.py | 1 - sdk/python/feast/infra/online_stores/online_store.py | 5 +++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index ea0128eb4c5..2dcd1cca2a5 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -33,7 +33,6 @@ cast, ) -import numpy as np import pandas as pd import pyarrow as pa from colorama import Fore, Style @@ -1724,7 +1723,10 @@ def _retrieve_online_documents( raise ValueError( "Using embedding functionality is not supported for document retrieval. Please embed the query before calling retrieve_online_documents." 
) - (requested_feature_views, _,) = self._get_feature_views_to_use( + ( + requested_feature_views, + _, + ) = self._get_feature_views_to_use( features=[feature], allow_cache=True, hide_dummy_entity=False ) requested_feature = ( @@ -1970,7 +1972,7 @@ def _retrieve_from_online_store( provider: Provider, table: FeatureView, requested_feature: str, - query: Union[str, List[float]], + query: List[float], top_k: int, ) -> List[Tuple[List[Timestamp], List["FieldStatus.ValueType"], List[Value]]]: """ diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 75b5b47059a..e658870239c 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -4,7 +4,6 @@ from datetime import datetime from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple -import numpy as np import psycopg2 import pytz from psycopg2 import sql diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index 4cbd935a539..73d569a36aa 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -152,5 +152,10 @@ def retrieve_online_documents( requested_feature: The name of the feature whose embeddings should be used for retrieval. embedding: The embeddings to use for retrieval. top_k: The number of nearest neighbors to retrieve. + + Returns: + A list of top k closest documents to the specified embedding. Each item in the list is a tuple + where the first item is the event timestamp for the row, and the second item is a dict of feature + name to embeddings. 
""" pass From 11eb97f0a11414070e5f04a2952f57f35e1395b1 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Tue, 9 Apr 2024 00:14:46 -0700 Subject: [PATCH 08/40] format --- sdk/python/feast/infra/online_stores/online_store.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index 73d569a36aa..25fc16daa5b 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -152,7 +152,6 @@ def retrieve_online_documents( requested_feature: The name of the feature whose embeddings should be used for retrieval. embedding: The embeddings to use for retrieval. top_k: The number of nearest neighbors to retrieve. - Returns: A list of top k closest documents to the specified embedding. Each item in the list is a tuple where the first item is the event timestamp for the row, and the second item is a dict of feature From 865baf2348442f77b69a6b156785acbae11ef90d Mon Sep 17 00:00:00 2001 From: cmuhao Date: Tue, 9 Apr 2024 00:15:43 -0700 Subject: [PATCH 09/40] format --- sdk/python/feast/infra/online_stores/online_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index 25fc16daa5b..e7c76826303 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -153,7 +153,7 @@ def retrieve_online_documents( embedding: The embeddings to use for retrieval. top_k: The number of nearest neighbors to retrieve. Returns: - A list of top k closest documents to the specified embedding. Each item in the list is a tuple + object: A list of top k closest documents to the specified embedding. Each item in the list is a tuple where the first item is the event timestamp for the row, and the second item is a dict of feature name to embeddings. 
""" From 47cd117e48dd7e2e6bd6a51a724f2bf5a3665ea8 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Tue, 9 Apr 2024 00:16:26 -0700 Subject: [PATCH 10/40] format --- sdk/python/feast/infra/online_stores/online_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index e7c76826303..134426b402b 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -152,6 +152,7 @@ def retrieve_online_documents( requested_feature: The name of the feature whose embeddings should be used for retrieval. embedding: The embeddings to use for retrieval. top_k: The number of nearest neighbors to retrieve. + Returns: object: A list of top k closest documents to the specified embedding. Each item in the list is a tuple where the first item is the event timestamp for the row, and the second item is a dict of feature From 3f9f59f15eafaf8e6c9d8b566b7fb07e602edd8c Mon Sep 17 00:00:00 2001 From: cmuhao Date: Tue, 9 Apr 2024 00:17:19 -0700 Subject: [PATCH 11/40] format --- sdk/python/feast/infra/online_stores/online_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index 134426b402b..4c664c5db0d 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -135,6 +135,7 @@ def teardown( """ pass + @abstractmethod def retrieve_online_documents( self, config: RepoConfig, From 79350715784a2e3596f89a046ffae7b36f24843d Mon Sep 17 00:00:00 2001 From: cmuhao Date: Tue, 9 Apr 2024 00:21:27 -0700 Subject: [PATCH 12/40] remove unused vars --- sdk/python/feast/infra/online_stores/contrib/postgres.py | 6 ------ sdk/python/feast/repo_config.py | 5 ----- 2 files changed, 11 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py 
b/sdk/python/feast/infra/online_stores/contrib/postgres.py index e658870239c..aa078aa4866 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -31,12 +31,6 @@ """ -# Create index query template to create a index based on the index type -CREATE_INDEX_QUERY_TEMPLATE = """ -CREATE INDEX ON {table_name} USING {index_type} (embedding {embeding_type}); -""" - - class PostgreSQLOnlineStoreConfig(PostgreSQLConfig): type: Literal["postgres"] = "postgres" diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 3265defee5b..fe3491c6fe6 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -63,8 +63,6 @@ "mysql": "feast.infra.online_stores.contrib.mysql_online_store.mysql.MySQLOnlineStore", "rockset": "feast.infra.online_stores.contrib.rockset_online_store.rockset.RocksetOnlineStore", "hazelcast": "feast.infra.online_stores.contrib.hazelcast_online_store.hazelcast_online_store.HazelcastOnlineStore", - # below are supported Online Document Store - "postgresDocument": "feast.infra.online_stores.contrib.postgres.PostgresDocumentStore", } OFFLINE_STORE_CLASS_FOR_TYPE = { @@ -183,9 +181,6 @@ class RepoConfig(FeastBaseModel): coerce_tz_aware: Optional[bool] = True """ If True, coerces entity_df timestamp columns to be timezone aware (to UTC by default). 
""" - document_store_config: Any = Field(None, alias="document_store") - """ DocumentStoreConfig: Document store configuration (optional depending on provider) """ - def __init__(self, **data: Any): super().__init__(**data) From ba39f93e6fd524c0ab283c1a1ee490e3066136a4 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:26:36 -0700 Subject: [PATCH 13/40] add test --- Makefile | 2 +- sdk/python/feast/feature_store.py | 4 +-- .../infra/online_stores/contrib/postgres.py | 2 +- .../feast/infra/passthrough_provider.py | 1 - sdk/python/tests/conftest.py | 12 +++++++ .../universal/online_store/postgres.py | 33 +++++++++++++++++++ .../online_store/test_universal_online.py | 25 ++++++++++++++ 7 files changed, 74 insertions(+), 5 deletions(-) create mode 100644 sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py diff --git a/Makefile b/Makefile index 813a27f4e3b..6fcf95dc7da 100644 --- a/Makefile +++ b/Makefile @@ -200,7 +200,7 @@ test-python-universal-postgres-offline: test-python-universal-postgres-online: PYTHONPATH='.' \ FULL_REPO_CONFIGS_MODULE=sdk.python.feast.infra.online_stores.contrib.postgres_repo_configuration \ - PYTEST_PLUGINS=sdk.python.feast.infra.offline_stores.contrib.postgres_offline_store.tests \ + PYTEST_PLUGINS=sdk.python.tests.integration.feature_repos.universal.online_store.postgres \ python -m pytest -n 8 --integration \ -k "not test_universal_cli and \ not test_go_feature_server and \ diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 2dcd1cca2a5..3e5f3400fbe 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1698,12 +1698,12 @@ def retrieve_online_documents( top_k: int, ) -> OnlineResponse: """ - Retrieves the top k closest document features. + Retrieves the top k closest document features. Note, embeddings are a subset of features. Args: feature: The list of document features that should be retrieved from the online document store. 
These features can be specified either as a list of string document feature references or as a feature service. String feature - references must have format "feature_view:feature", e.g, "document_fv:document_embedding_feature". + references must have format "feature_view:feature", e.g, "document_fv:document_embeddings". query: The query to retrieve the closest document features for. top_k: The number of closest document features to retrieve. """ diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index aa078aa4866..e8a1b440093 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -267,7 +267,7 @@ def retrieve_online_documents( requested_feature: str, embedding: List[float], top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + ) -> List[Tuple[Optional[datetime], Optional[ValueProto]]]: """ Args: diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index bc53204f2ab..6adafaca848 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -47,7 +47,6 @@ def __init__(self, config: RepoConfig): self.repo_config = config self._offline_store = None self._online_store = None - self._document_store = None self._batch_engine: Optional[BatchMaterializationEngine] = None @property diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 1c9a958ce36..8d1c7d85af7 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -405,3 +405,15 @@ def fake_ingest_data(): "created": [pd.Timestamp(datetime.utcnow()).round("ms")], } return pd.DataFrame(data) + + +@pytest.fixture +def fake_ingest_document_data(): + """Fake document data to ingest into the feature store""" + data = { + "driver_id": [1], + "doc": [4, 5], + "event_timestamp": 
[pd.Timestamp(datetime.utcnow()).round("ms")], + "created": [pd.Timestamp(datetime.utcnow()).round("ms")], + } + return pd.DataFrame(data) \ No newline at end of file diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py new file mode 100644 index 00000000000..49b8cd81cb5 --- /dev/null +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py @@ -0,0 +1,33 @@ +from typing import Dict + +from testcontainers.postgres import PostgresContainer + +from tests.integration.feature_repos.universal.online_store_creator import ( + OnlineStoreCreator, +) + + +class PostgresOnlieStoreCreator(OnlineStoreCreator): + def __init__(self, project_name: str, **kwargs): + super().__init__(project_name) + self.container = ( + PostgresContainer("postgres:latest", platform="linux/amd64") + .with_exposed_ports(5432) + .with_env("POSTGRES_USER", "root") + .with_env("POSTGRES_PASSWORD", "test") + .with_env("POSTGRES_DB", "test") + ) + + def create_online_store(self) -> Dict[str, str]: + self.container.start() + exposed_port = self.container.get_exposed_port(5432) + return { + "type": "postgres", + "user": "root", + "password": "test", + "database": "test", + "port": exposed_port, + } + + def teardown(self): + self.container.stop() diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 82189713151..c1305479e3a 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -785,3 +785,28 @@ def assert_feature_service_entity_mapping_correctness( entity_rows=entity_rows, full_feature_names=full_feature_names, ) + + +@pytest.mark.integration +@pytest.mark.universal_online_stores(only=["postgres"]) +def test_retrieve_online_documents( + environment, 
universal_data_sources, fake_ingest_document_data +): + fs = environment.feature_store + entities, datasets, data_sources = universal_data_sources + driver_hourly_stats = create_driver_hourly_stats_feature_view(data_sources.driver) + driver_entity = driver() + + # Register Feature View and Entity + fs.apply([driver_hourly_stats, driver_entity]) + + # directly ingest data into the Online Store + fs.write_to_online_store("document_fv", fake_ingest_document_data) + + # retrieve the online documents + documents = fs.retrieve_online_documents( + feature="document_fv:doc", + query="[1, 2]", + top_k=5 + ) + assert len(documents) == 2 \ No newline at end of file From cf53c71a7ce4b3967b913d0f5bbb579331bead15 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:26:55 -0700 Subject: [PATCH 14/40] add test --- sdk/python/tests/conftest.py | 2 +- .../integration/online_store/test_universal_online.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 8d1c7d85af7..553c75315ec 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -416,4 +416,4 @@ def fake_ingest_document_data(): "event_timestamp": [pd.Timestamp(datetime.utcnow()).round("ms")], "created": [pd.Timestamp(datetime.utcnow()).round("ms")], } - return pd.DataFrame(data) \ No newline at end of file + return pd.DataFrame(data) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index c1305479e3a..8b45345b92c 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -790,7 +790,7 @@ def assert_feature_service_entity_mapping_correctness( @pytest.mark.integration @pytest.mark.universal_online_stores(only=["postgres"]) def test_retrieve_online_documents( - environment, universal_data_sources, 
fake_ingest_document_data + environment, universal_data_sources, fake_ingest_document_data ): fs = environment.feature_store entities, datasets, data_sources = universal_data_sources @@ -805,8 +805,6 @@ def test_retrieve_online_documents( # retrieve the online documents documents = fs.retrieve_online_documents( - feature="document_fv:doc", - query="[1, 2]", - top_k=5 + feature="document_fv:doc", query="[1, 2]", top_k=5 ) - assert len(documents) == 2 \ No newline at end of file + assert len(documents) == 2 From 92046afeb28c3747e779c5cc37e7a17b59506677 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:28:45 -0700 Subject: [PATCH 15/40] format --- sdk/python/feast/infra/online_stores/online_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index 4c664c5db0d..79d8dd992e7 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -143,7 +143,7 @@ def retrieve_online_documents( requested_feature: str, embedding: List[float], top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + ) -> List[Tuple[Optional[datetime], Optional[ValueProto]]]: """ Retrieves online feature values for the specified embeddings. 
From d0acd2d3a5646329bb24c9fba1d981735e69195a Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:29:36 -0700 Subject: [PATCH 16/40] format --- sdk/python/feast/infra/provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index c627af04ba9..c64a38e5e91 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -303,7 +303,7 @@ def retrieve_online_documents( requested_feature: str, query: List[float], top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: + ) -> List[Tuple[Optional[datetime], Optional[ValueProto]]]: """ Searches for the top-k nearest neighbors of the given document in the online document store. From cc45f739e1fcd19b54d647e46052f7beac33b278 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:32:43 -0700 Subject: [PATCH 17/40] format --- sdk/python/feast/infra/online_stores/contrib/postgres.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index e8a1b440093..f9944d48988 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -284,7 +284,7 @@ def retrieve_online_documents( # Convert the embedding to a string to be used in postgres vector search query_embedding_str = f"'[{','.join(str(el) for el in embedding)}]'" - result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] + result: List[Tuple[Optional[datetime], Optional[ValueProto]]] = [] with self._get_conn(config) as conn, conn.cursor() as cur: cur.execute( SEARCH_QUERY_TEMPLATE.format( @@ -294,12 +294,12 @@ def retrieve_online_documents( ) rows = cur.fetchall() + event_ts: for feature_name, value, event_ts in rows: val = ValueProto() val.ParseFromString(value) - res = {feature_name: 
val} - result.append((event_ts, res)) + result.append((event_ts, val)) return result From 006b5c6614ce82a86d10c0cb95a017806ce4ebfa Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:33:12 -0700 Subject: [PATCH 18/40] format --- sdk/python/feast/infra/online_stores/contrib/postgres.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index f9944d48988..e56f5e503f3 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -294,7 +294,6 @@ def retrieve_online_documents( ) rows = cur.fetchall() - event_ts: for feature_name, value, event_ts in rows: val = ValueProto() val.ParseFromString(value) From 6e0ba03936557be26f1f2b8ff06ee4a23c0eb5b7 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:36:33 -0700 Subject: [PATCH 19/40] format --- sdk/python/feast/feature_store.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 3e5f3400fbe..3c140cc2a88 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -2001,11 +2001,8 @@ def _retrieve_from_online_store( statuses = [FieldStatus.NOT_FOUND] values = [null_value] else: - statuses = [] - values = [] - for feature_name, feature_value in feature_data.items(): - statuses.append(FieldStatus.PRESENT) - values.append(feature_value) + statuses = [FieldStatus.PRESENT] + values = [feature_data] read_row_protos.append((event_timestamps, statuses, values)) return read_row_protos From a2302beb14bdf7bd449d377dd126bc3daae556d7 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:47:12 -0700 Subject: [PATCH 20/40] fix not implemented issue --- sdk/python/feast/infra/online_stores/online_store.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index 79d8dd992e7..f37166a4cce 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -135,7 +135,6 @@ def teardown( """ pass - @abstractmethod def retrieve_online_documents( self, config: RepoConfig, @@ -159,4 +158,6 @@ def retrieve_online_documents( where the first item is the event timestamp for the row, and the second item is a dict of feature name to embeddings. """ - pass + raise NotImplementedError( + f"Online store {self.__class__.__name__} does not support online retrieval" + ) \ No newline at end of file From 2e6fc551a7ba9dfe5e3a78c46cd8cc45027ea946 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Wed, 10 Apr 2024 22:47:38 -0700 Subject: [PATCH 21/40] fix not implemented issue --- sdk/python/feast/infra/online_stores/online_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index f37166a4cce..fb5c0f654b3 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -160,4 +160,4 @@ def retrieve_online_documents( """ raise NotImplementedError( f"Online store {self.__class__.__name__} does not support online retrieval" - ) \ No newline at end of file + ) From 3cbbf21d376b7e2836ea1bf6be64e401afe893be Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 15:46:20 -0700 Subject: [PATCH 22/40] fix test --- sdk/python/tests/foo_provider.py | 106 +++++++++--------- .../online_store/test_universal_online.py | 2 +- 2 files changed, 57 insertions(+), 51 deletions(-) diff --git a/sdk/python/tests/foo_provider.py b/sdk/python/tests/foo_provider.py index ba256a3813c..68524a769ac 100644 --- a/sdk/python/tests/foo_provider.py +++ b/sdk/python/tests/foo_provider.py @@ -16,69 +16,70 @@ class 
FooProvider(Provider): + def __init__(self, config: RepoConfig): pass def update_infra( - self, - project: str, - tables_to_delete: Sequence[FeatureView], - tables_to_keep: Sequence[FeatureView], - entities_to_delete: Sequence[Entity], - entities_to_keep: Sequence[Entity], - partial: bool, + self, + project: str, + tables_to_delete: Sequence[FeatureView], + tables_to_keep: Sequence[FeatureView], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + partial: bool, ): pass def teardown_infra( - self, - project: str, - tables: Sequence[FeatureView], - entities: Sequence[Entity], + self, + project: str, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ): pass def online_write_batch( - self, - config: RepoConfig, - table: FeatureView, - data: List[ - Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] - ], - progress: Optional[Callable[[int], Any]], + self, + config: RepoConfig, + table: FeatureView, + data: List[ + Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] + ], + progress: Optional[Callable[[int], Any]], ) -> None: pass def materialize_single_feature_view( - self, - config: RepoConfig, - feature_view: FeatureView, - start_date: datetime, - end_date: datetime, - registry: BaseRegistry, - project: str, - tqdm_builder: Callable[[int], tqdm], + self, + config: RepoConfig, + feature_view: FeatureView, + start_date: datetime, + end_date: datetime, + registry: BaseRegistry, + project: str, + tqdm_builder: Callable[[int], tqdm], ) -> None: pass def get_historical_features( - self, - config: RepoConfig, - feature_views: List[FeatureView], - feature_refs: List[str], - entity_df: Union[pandas.DataFrame, str], - registry: BaseRegistry, - project: str, - full_feature_names: bool = False, + self, + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pandas.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: 
bool = False, ) -> RetrievalJob: return RetrievalJob() def online_read( - self, - config: RepoConfig, - table: FeatureView, - entity_keys: List[EntityKeyProto], - requested_features: Optional[List[str]] = None, + self, + config: RepoConfig, + table: FeatureView, + entity_keys: List[EntityKeyProto], + requested_features: Optional[List[str]] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: return [] @@ -86,20 +87,25 @@ def retrieve_saved_dataset(self, config: RepoConfig, dataset: SavedDataset): pass def write_feature_service_logs( - self, - feature_service: FeatureService, - logs: Union[pyarrow.Table, Path], - config: RepoConfig, - registry: BaseRegistry, + self, + feature_service: FeatureService, + logs: Union[pyarrow.Table, Path], + config: RepoConfig, + registry: BaseRegistry, ): pass def retrieve_feature_service_logs( - self, - feature_service: FeatureService, - start_date: datetime, - end_date: datetime, - config: RepoConfig, - registry: BaseRegistry, + self, + feature_service: FeatureService, + start_date: datetime, + end_date: datetime, + config: RepoConfig, + registry: BaseRegistry, ) -> RetrievalJob: return RetrievalJob() + + def retrieve_online_documents(self, config: RepoConfig, table: FeatureView, requested_feature: str, + query: List[float], top_k: int) -> List[ + Tuple[Optional[datetime], Optional[ValueProto]]]: + return [] \ No newline at end of file diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 8b45345b92c..5978e2e2fed 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -807,4 +807,4 @@ def test_retrieve_online_documents( documents = fs.retrieve_online_documents( feature="document_fv:doc", query="[1, 2]", top_k=5 ) - assert len(documents) == 2 + assert len(documents) == 2 \ No newline at end of file From 
ec32764ecfdca2523562c38f60b14c7d490b7292 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 15:46:40 -0700 Subject: [PATCH 23/40] format --- sdk/python/tests/foo_provider.py | 114 +++++++++--------- .../online_store/test_universal_online.py | 2 +- 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/sdk/python/tests/foo_provider.py b/sdk/python/tests/foo_provider.py index 68524a769ac..e280d73e682 100644 --- a/sdk/python/tests/foo_provider.py +++ b/sdk/python/tests/foo_provider.py @@ -16,70 +16,69 @@ class FooProvider(Provider): - def __init__(self, config: RepoConfig): pass def update_infra( - self, - project: str, - tables_to_delete: Sequence[FeatureView], - tables_to_keep: Sequence[FeatureView], - entities_to_delete: Sequence[Entity], - entities_to_keep: Sequence[Entity], - partial: bool, + self, + project: str, + tables_to_delete: Sequence[FeatureView], + tables_to_keep: Sequence[FeatureView], + entities_to_delete: Sequence[Entity], + entities_to_keep: Sequence[Entity], + partial: bool, ): pass def teardown_infra( - self, - project: str, - tables: Sequence[FeatureView], - entities: Sequence[Entity], + self, + project: str, + tables: Sequence[FeatureView], + entities: Sequence[Entity], ): pass def online_write_batch( - self, - config: RepoConfig, - table: FeatureView, - data: List[ - Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] - ], - progress: Optional[Callable[[int], Any]], + self, + config: RepoConfig, + table: FeatureView, + data: List[ + Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]] + ], + progress: Optional[Callable[[int], Any]], ) -> None: pass def materialize_single_feature_view( - self, - config: RepoConfig, - feature_view: FeatureView, - start_date: datetime, - end_date: datetime, - registry: BaseRegistry, - project: str, - tqdm_builder: Callable[[int], tqdm], + self, + config: RepoConfig, + feature_view: FeatureView, + start_date: datetime, + end_date: datetime, + registry: 
BaseRegistry, + project: str, + tqdm_builder: Callable[[int], tqdm], ) -> None: pass def get_historical_features( - self, - config: RepoConfig, - feature_views: List[FeatureView], - feature_refs: List[str], - entity_df: Union[pandas.DataFrame, str], - registry: BaseRegistry, - project: str, - full_feature_names: bool = False, + self, + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pandas.DataFrame, str], + registry: BaseRegistry, + project: str, + full_feature_names: bool = False, ) -> RetrievalJob: return RetrievalJob() def online_read( - self, - config: RepoConfig, - table: FeatureView, - entity_keys: List[EntityKeyProto], - requested_features: Optional[List[str]] = None, + self, + config: RepoConfig, + table: FeatureView, + entity_keys: List[EntityKeyProto], + requested_features: Optional[List[str]] = None, ) -> List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]]: return [] @@ -87,25 +86,30 @@ def retrieve_saved_dataset(self, config: RepoConfig, dataset: SavedDataset): pass def write_feature_service_logs( - self, - feature_service: FeatureService, - logs: Union[pyarrow.Table, Path], - config: RepoConfig, - registry: BaseRegistry, + self, + feature_service: FeatureService, + logs: Union[pyarrow.Table, Path], + config: RepoConfig, + registry: BaseRegistry, ): pass def retrieve_feature_service_logs( - self, - feature_service: FeatureService, - start_date: datetime, - end_date: datetime, - config: RepoConfig, - registry: BaseRegistry, + self, + feature_service: FeatureService, + start_date: datetime, + end_date: datetime, + config: RepoConfig, + registry: BaseRegistry, ) -> RetrievalJob: return RetrievalJob() - def retrieve_online_documents(self, config: RepoConfig, table: FeatureView, requested_feature: str, - query: List[float], top_k: int) -> List[ - Tuple[Optional[datetime], Optional[ValueProto]]]: - return [] \ No newline at end of file + def retrieve_online_documents( + self, + config: 
RepoConfig, + table: FeatureView, + requested_feature: str, + query: List[float], + top_k: int, + ) -> List[Tuple[Optional[datetime], Optional[ValueProto]]]: + return [] diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 5978e2e2fed..8b45345b92c 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -807,4 +807,4 @@ def test_retrieve_online_documents( documents = fs.retrieve_online_documents( feature="document_fv:doc", query="[1, 2]", top_k=5 ) - assert len(documents) == 2 \ No newline at end of file + assert len(documents) == 2 From e2d8008cf4baa6c8f76263a4652e5b3811bce4d5 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 17:15:22 -0700 Subject: [PATCH 24/40] format --- .../contrib/postgres_offline_store/tests/data_source.py | 4 ++++ .../online_stores/contrib/postgres_repo_configuration.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py index f50cdc4c41f..bddd0f10e64 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py @@ -13,6 +13,7 @@ ) from feast.infra.utils.postgres.connection_utils import df_to_postgres_table from feast.infra.utils.postgres.postgres_config import PostgreSQLConfig +from feast.feature_logging import LoggingDestination from tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, ) @@ -57,6 +58,9 @@ def postgres_container(): class PostgreSQLDataSourceCreator(DataSourceCreator, OnlineStoreCreator): + def create_logged_features_destination(self) -> 
LoggingDestination: + pass + def __init__( self, project_name: str, fixture_request: pytest.FixtureRequest, **kwargs ): diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py index 2a9f0d54cd4..4430278a0ba 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -6,5 +6,7 @@ ) FULL_REPO_CONFIGS = [ - IntegrationTestRepoConfig(online_store_creator=PostgreSQLDataSourceCreator), + IntegrationTestRepoConfig( + online_store="postgres", + online_store_creator=PostgreSQLDataSourceCreator), ] From 523d20f26626ab5f6468b7910ee58d8fef0b6458 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 17:15:35 -0700 Subject: [PATCH 25/40] format --- .../contrib/postgres_offline_store/tests/data_source.py | 2 +- .../online_stores/contrib/postgres_repo_configuration.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py index bddd0f10e64..2c35ccddb9f 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py @@ -7,13 +7,13 @@ from testcontainers.core.waiting_utils import wait_for_logs from feast.data_source import DataSource +from feast.feature_logging import LoggingDestination from feast.infra.offline_stores.contrib.postgres_offline_store.postgres import ( PostgreSQLOfflineStoreConfig, PostgreSQLSource, ) from feast.infra.utils.postgres.connection_utils import df_to_postgres_table from feast.infra.utils.postgres.postgres_config import PostgreSQLConfig -from feast.feature_logging import LoggingDestination from 
tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, ) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py index 4430278a0ba..41760e21eec 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -7,6 +7,6 @@ FULL_REPO_CONFIGS = [ IntegrationTestRepoConfig( - online_store="postgres", - online_store_creator=PostgreSQLDataSourceCreator), + online_store="postgres", online_store_creator=PostgreSQLDataSourceCreator + ), ] From 5cd085d6cef74b23ee3ef818b444a318d7f44c0e Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 17:17:50 -0700 Subject: [PATCH 26/40] format --- .../contrib/postgres_offline_store/tests/data_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py index 2c35ccddb9f..973254750eb 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py @@ -59,7 +59,7 @@ def postgres_container(): class PostgreSQLDataSourceCreator(DataSourceCreator, OnlineStoreCreator): def create_logged_features_destination(self) -> LoggingDestination: - pass + return None def __init__( self, project_name: str, fixture_request: pytest.FixtureRequest, **kwargs From 795699e1a49c9ed49105cf2f7bb46c9d1618a191 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 17:18:32 -0700 Subject: [PATCH 27/40] format --- .../contrib/postgres_offline_store/tests/data_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py index 973254750eb..e750923d81c 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py @@ -59,7 +59,7 @@ def postgres_container(): class PostgreSQLDataSourceCreator(DataSourceCreator, OnlineStoreCreator): def create_logged_features_destination(self) -> LoggingDestination: - return None + return LoggingDestination() def __init__( self, project_name: str, fixture_request: pytest.FixtureRequest, **kwargs From 67b007f70201ec5bebf23f1fc417890ba1fc0f7b Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 17:19:25 -0700 Subject: [PATCH 28/40] format --- .../contrib/postgres_offline_store/tests/data_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py index e750923d81c..a23d90e1868 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/postgres_offline_store/tests/data_source.py @@ -59,7 +59,7 @@ def postgres_container(): class PostgreSQLDataSourceCreator(DataSourceCreator, OnlineStoreCreator): def create_logged_features_destination(self) -> LoggingDestination: - return LoggingDestination() + return None # type: ignore def __init__( self, project_name: str, fixture_request: pytest.FixtureRequest, **kwargs From 33b46bd779df8c87ca2f7ab8cee2438334edffca Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 21:42:52 -0700 Subject: [PATCH 29/40] update testcontainer --- .../feature_repos/universal/online_store/cassandra.py | 6 +++--- 
.../feature_repos/universal/online_store/postgres.py | 2 +- setup.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py b/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py index 190d94a8305..3b513d63a65 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py @@ -17,7 +17,7 @@ import time from typing import Dict -from testcontainers.core.container import DockerContainer +from testcontainers.cassandra import CassandraContainer from testcontainers.core.waiting_utils import wait_for_logs from tests.integration.feature_repos.universal.online_store_creator import ( @@ -28,8 +28,8 @@ class CassandraOnlineStoreCreator(OnlineStoreCreator): def __init__(self, project_name: str, **kwargs): super().__init__(project_name) - self.container = DockerContainer("library/cassandra:4.0.4").with_exposed_ports( - "9042" + self.container = CassandraContainer("cassandra:4.1.4").with_exposed_ports( + 9042 ) def create_online_store(self) -> Dict[str, object]: diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py index 49b8cd81cb5..08d73d80785 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py @@ -11,7 +11,7 @@ class PostgresOnlieStoreCreator(OnlineStoreCreator): def __init__(self, project_name: str, **kwargs): super().__init__(project_name) self.container = ( - PostgresContainer("postgres:latest", platform="linux/amd64") + PostgresContainer("postgres:16", platform="linux/amd64") .with_exposed_ports(5432) .with_env("POSTGRES_USER", "root") .with_env("POSTGRES_PASSWORD", "test") diff --git a/setup.py b/setup.py 
index f94fb25bb55..e686ad70620 100644 --- a/setup.py +++ b/setup.py @@ -177,7 +177,7 @@ "pytest-mock==1.10.4", "pytest-env", "Sphinx>4.0.0,<7", - "testcontainers>=3.5,<4", + "testcontainers==4.3.3", "firebase-admin>=5.2.0,<6", "pre-commit<3.3.2", "assertpy==1.1", From 82fe5f1c2e2dac257a1bd032ca6409c575204670 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Thu, 11 Apr 2024 21:44:23 -0700 Subject: [PATCH 30/40] format --- .../feature_repos/universal/online_store/cassandra.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py b/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py index 3b513d63a65..41ff6d329d9 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py @@ -28,9 +28,7 @@ class CassandraOnlineStoreCreator(OnlineStoreCreator): def __init__(self, project_name: str, **kwargs): super().__init__(project_name) - self.container = CassandraContainer("cassandra:4.1.4").with_exposed_ports( - 9042 - ) + self.container = CassandraContainer("cassandra:4.1.4").with_exposed_ports(9042) def create_online_store(self) -> Dict[str, object]: self.container.start() From 061837868345878b57147f15d384963aef7dee9d Mon Sep 17 00:00:00 2001 From: cmuhao Date: Fri, 12 Apr 2024 14:22:34 -0700 Subject: [PATCH 31/40] fix postgres integration test --- .../contrib/postgres_repo_configuration.py | 6 +++--- .../universal/online_store/postgres.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py index 41760e21eec..e6792c940b5 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py +++ 
b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -1,5 +1,5 @@ -from feast.infra.offline_stores.contrib.postgres_offline_store.tests.data_source import ( - PostgreSQLDataSourceCreator, +from tests.integration.feature_repos.universal.online_store.postgres import ( + PostgresOnlieStoreCreator, ) from tests.integration.feature_repos.integration_test_repo_config import ( IntegrationTestRepoConfig, @@ -7,6 +7,6 @@ FULL_REPO_CONFIGS = [ IntegrationTestRepoConfig( - online_store="postgres", online_store_creator=PostgreSQLDataSourceCreator + online_store="postgres", online_store_creator=PostgresOnlieStoreCreator ), ] diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py index 08d73d80785..cb1e21dba64 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py @@ -11,22 +11,24 @@ class PostgresOnlieStoreCreator(OnlineStoreCreator): def __init__(self, project_name: str, **kwargs): super().__init__(project_name) self.container = ( - PostgresContainer("postgres:16", platform="linux/amd64") + PostgresContainer( + "postgres:16", + username="root", + password="test", + dbname="test", + ) .with_exposed_ports(5432) - .with_env("POSTGRES_USER", "root") - .with_env("POSTGRES_PASSWORD", "test") - .with_env("POSTGRES_DB", "test") ) def create_online_store(self) -> Dict[str, str]: self.container.start() - exposed_port = self.container.get_exposed_port(5432) return { + "host": "localhost", "type": "postgres", "user": "root", "password": "test", "database": "test", - "port": exposed_port, + "port": self.container.get_exposed_port(5432), } def teardown(self): From 7de2016989f229150dec1f533dc0d7c1097494b2 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Fri, 12 Apr 2024 14:22:59 -0700 Subject: [PATCH 32/40] format --- 
.../contrib/postgres_repo_configuration.py | 6 +++--- .../universal/online_store/postgres.py | 15 ++++++--------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py index e6792c940b5..4446f51e8c8 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -1,9 +1,9 @@ -from tests.integration.feature_repos.universal.online_store.postgres import ( - PostgresOnlieStoreCreator, -) from tests.integration.feature_repos.integration_test_repo_config import ( IntegrationTestRepoConfig, ) +from tests.integration.feature_repos.universal.online_store.postgres import ( + PostgresOnlieStoreCreator, +) FULL_REPO_CONFIGS = [ IntegrationTestRepoConfig( diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py index cb1e21dba64..65758035e52 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py @@ -10,15 +10,12 @@ class PostgresOnlieStoreCreator(OnlineStoreCreator): def __init__(self, project_name: str, **kwargs): super().__init__(project_name) - self.container = ( - PostgresContainer( - "postgres:16", - username="root", - password="test", - dbname="test", - ) - .with_exposed_ports(5432) - ) + self.container = PostgresContainer( + "postgres:16", + username="root", + password="test", + dbname="test", + ).with_exposed_ports(5432) def create_online_store(self) -> Dict[str, str]: self.container.start() From 92fed1d08cee90c3069f1948ef5b42f2eb4a5f34 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 14 Apr 2024 15:33:44 -0700 Subject: [PATCH 33/40] fix postgres test --- 
sdk/python/feast/feature_store.py | 49 +++++++------ .../infra/online_stores/contrib/postgres.py | 73 +++++++++++++------ .../contrib/postgres_repo_configuration.py | 10 ++- .../feast/infra/online_stores/online_store.py | 2 +- .../feast/infra/passthrough_provider.py | 4 +- sdk/python/feast/infra/provider.py | 2 +- sdk/python/tests/conftest.py | 37 +++++----- sdk/python/tests/data/data_creator.py | 18 +++++ sdk/python/tests/foo_provider.py | 2 +- .../feature_repos/universal/feature_views.py | 1 - .../universal/online_store/postgres.py | 40 +++++++++- .../online_store/test_universal_online.py | 28 +++---- 12 files changed, 179 insertions(+), 87 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 3c140cc2a88..ba2c1dd34d3 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1740,14 +1740,19 @@ def _retrieve_online_documents( query, top_k, ) + document_feature_vals = [feature[2] for feature in document_features] + document_feature_distance_vals = [feature[3] for feature in document_features] online_features_response = GetOnlineFeaturesResponse(results=[]) - self._populate_response_from_feature_data( - document_features, - [], - online_features_response, - False, - requested_feature, - requested_feature_views[0], + + # TODO Refactor to better way of populating result + # TODO populate entity in the response after returning entity in document_features is supported + self._populate_result_rows_from_columnar( + online_features_response=online_features_response, + data={requested_feature: document_feature_vals} + ) + self._populate_result_rows_from_columnar( + online_features_response=online_features_response, + data={"distance": document_feature_distance_vals} ) return OnlineResponse(online_features_response) @@ -1974,7 +1979,7 @@ def _retrieve_from_online_store( requested_feature: str, query: List[float], top_k: int, - ) -> List[Tuple[List[Timestamp], List["FieldStatus.ValueType"], 
List[Value]]]: + ) -> List[Tuple[Timestamp, "FieldStatus.ValueType", Value, Value]]: """ Search and return document features from the online document store. """ @@ -1985,25 +1990,27 @@ def _retrieve_from_online_store( query=query, top_k=top_k, ) - # Each row is a set of features for a given entity key. We only need to convert - # the data to Protobuf once. + null_value = Value() + not_found_status = FieldStatus.NOT_FOUND + present_status = FieldStatus.PRESENT + read_row_protos = [] + row_ts_proto = Timestamp() - for doc in documents: - row_ts_proto = Timestamp() - row_ts, feature_data = doc - # TODO (Ly): reuse whatever timestamp if row_ts is None? + for row_ts, feature_val, distance in documents: + # Reset timestamp to default or update if row_ts is not None if row_ts is not None: row_ts_proto.FromDatetime(row_ts) - event_timestamps = [row_ts_proto] - if feature_data is None: - statuses = [FieldStatus.NOT_FOUND] - values = [null_value] + + if feature_val is None: + status = not_found_status + value = null_value else: - statuses = [FieldStatus.PRESENT] - values = [feature_data] - read_row_protos.append((event_timestamps, statuses, values)) + status = present_status + value = feature_val + + read_row_protos.append((row_ts_proto, status, value, distance)) return read_row_protos @staticmethod diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index e56f5e503f3..e030193972f 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -9,7 +9,6 @@ from psycopg2 import sql from psycopg2.extras import execute_values from psycopg2.pool import SimpleConnectionPool - from feast import Entity from feast.feature_view import FeatureView from feast.infra.key_encoding_utils import serialize_entity_key @@ -21,19 +20,16 @@ from feast.repo_config import RepoConfig from feast.usage import log_exceptions_and_usage -# Search query 
template to find the top k items that are closest to the given embedding -# SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; -SEARCH_QUERY_TEMPLATE = """ -SELECT feature_name, value, event_ts FROM {table_name} -WHERE feature_name = '{feature_name}' -ORDER BY value <-> %s -LIMIT %s; -""" - class PostgreSQLOnlineStoreConfig(PostgreSQLConfig): type: Literal["postgres"] = "postgres" + # Whether to enable the pgvector extension for vector similarity search + pgvector_enabled: Optional[bool] = False + + # If pgvector is enabled, the length of the vector field + vector_len: Optional[int] = 512 + class PostgreSQLOnlineStore(OnlineStore): _conn: Optional[psycopg2._psycopg.connection] = None @@ -77,11 +73,15 @@ def online_write_batch( created_ts = _to_naive_utc(created_ts) for feature_name, val in values.items(): + if config.online_config["pgvector_enabled"]: + val = str(val.float_list_val.val) + else: + val = val.SerializeToString() insert_values.append( ( entity_key_bin, feature_name, - val.SerializeToString(), + val, timestamp, created_ts, ) @@ -221,6 +221,9 @@ def update( for table in tables_to_keep: table_name = _table_id(project, table) + value_type = "BYTEA" + if config.online_config["pgvector_enabled"]: + value_type = f'vector({config.online_config["vector_len"]})' cur.execute( sql.SQL( """ @@ -228,7 +231,7 @@ def update( ( entity_key BYTEA, feature_name TEXT, - value BYTEA, + value {}, event_ts TIMESTAMPTZ, created_ts TIMESTAMPTZ, PRIMARY KEY(entity_key, feature_name) @@ -237,6 +240,7 @@ def update( """ ).format( sql.Identifier(table_name), + sql.SQL(value_type), sql.Identifier(f"{table_name}_ek"), sql.Identifier(table_name), ) @@ -267,7 +271,7 @@ def retrieve_online_documents( requested_feature: str, embedding: List[float], top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[ValueProto]]]: + ) -> List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]]: """ Args: @@ -280,25 +284,50 @@ def retrieve_online_documents( List of 
tuples containing the event timestamp and the document feature """ + project = config.project # Convert the embedding to a string to be used in postgres vector search - query_embedding_str = f"'[{','.join(str(el) for el in embedding)}]'" + query_embedding_str = f"[{','.join(str(el) for el in embedding)}]" - result: List[Tuple[Optional[datetime], Optional[ValueProto]]] = [] + result: List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]] = [] with self._get_conn(config) as conn, conn.cursor() as cur: + table_name = _table_id(project, table) + + # Search query template to find the top k items that are closest to the given embedding + # SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 5; cur.execute( - SEARCH_QUERY_TEMPLATE.format( - table_name=table, feature_name=requested_feature + sql.SQL( + """ + SELECT + entity_key, + feature_name, + value, + value <-> %s as distance, + event_ts FROM {table_name} + WHERE feature_name = {feature_name} + ORDER BY distance + LIMIT {top_k}; + """ + ).format( + table_name=sql.Identifier(table_name), + feature_name=sql.Literal(requested_feature), + top_k=sql.Literal(top_k) ), - (query_embedding_str, top_k), + (query_embedding_str,), ) rows = cur.fetchall() - for feature_name, value, event_ts in rows: - val = ValueProto() - val.ParseFromString(value) + for entity_key, feature_name, value, distance, event_ts in rows: + + # TODO Deserialize entity_key to return the entity in response + entity_key_proto = EntityKeyProto() + entity_key_proto_bin = bytes(entity_key) + + # TODO Convert to List[float] for value type proto + feature_value_proto = ValueProto(string_val=value) - result.append((event_ts, val)) + distance_value_proto = ValueProto(float_val=distance) + result.append((event_ts, feature_value_proto, distance_value_proto)) return result diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py 
index 4446f51e8c8..9663623266c 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -2,11 +2,17 @@ IntegrationTestRepoConfig, ) from tests.integration.feature_repos.universal.online_store.postgres import ( - PostgresOnlieStoreCreator, + PostgresOnlineStoreCreator, + PGVectorOnlineStoreCreator ) FULL_REPO_CONFIGS = [ IntegrationTestRepoConfig( - online_store="postgres", online_store_creator=PostgresOnlieStoreCreator + online_store="postgres", + online_store_creator=PostgresOnlineStoreCreator + ), + IntegrationTestRepoConfig( + online_store="pgvector", + online_store_creator=PGVectorOnlineStoreCreator ), ] diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index fb5c0f654b3..fc1b3d4ad30 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -142,7 +142,7 @@ def retrieve_online_documents( requested_feature: str, embedding: List[float], top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[ValueProto]]]: + ) -> List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]]: """ Retrieves online feature values for the specified embeddings. 
diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index 6adafaca848..ec4df66d43a 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -196,14 +196,14 @@ def retrieve_online_documents( config: RepoConfig, table: FeatureView, requested_feature: str, - embedding: List[float], + query: List[float], top_k: int, ) -> List: set_usage_attribute("provider", self.__class__.__name__) result = [] if self.online_store: result = self.online_store.retrieve_online_documents( - config, table, requested_feature, embedding, top_k + config, table, requested_feature, query, top_k ) return result diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index c64a38e5e91..59355bc0a66 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -303,7 +303,7 @@ def retrieve_online_documents( requested_feature: str, query: List[float], top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[ValueProto]]]: + ) -> List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]]: """ Searches for the top-k nearest neighbors of the given document in the online document store. 
diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 553c75315ec..14c422feb92 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -23,9 +23,10 @@ import pytest from _pytest.nodes import Item +from feast.data_source import DataSource from feast.feature_store import FeatureStore # noqa: E402 from feast.wait import wait_retry_backoff # noqa: E402 -from tests.data.data_creator import create_basic_driver_dataset # noqa: E402 +from tests.data.data_creator import create_basic_driver_dataset, create_document_dataset # noqa: E402 from tests.integration.feature_repos.integration_test_repo_config import ( # noqa: E402 IntegrationTestRepoConfig, ) @@ -270,12 +271,12 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): # aws lambda works only with dynamo if ( - config.get("python_feature_server") - and config.get("provider") == "aws" - and ( + config.get("python_feature_server") + and config.get("provider") == "aws" + and ( not isinstance(online_store, dict) or online_store["type"] != "dynamodb" - ) + ) ): continue @@ -297,8 +298,8 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): @pytest.fixture def feature_server_endpoint(environment): if ( - not environment.python_feature_server - or environment.test_repo_config.provider != "local" + not environment.python_feature_server + or environment.test_repo_config.provider != "local" ): yield environment.feature_store.get_feature_server_endpoint() return @@ -310,8 +311,8 @@ def feature_server_endpoint(environment): args=(environment.feature_store.repo_path, port), ) if ( - environment.python_feature_server - and environment.test_repo_config.provider == "local" + environment.python_feature_server + and environment.test_repo_config.provider == "local" ): proc.start() # Wait for server to start @@ -354,7 +355,7 @@ def e2e_data_sources(environment: Environment): @pytest.fixture def feature_store_for_online_retrieval( - environment, universal_data_sources + environment, 
universal_data_sources ) -> Tuple[FeatureStore, List[str], List[Dict[str, int]]]: """ Returns a feature store that is ready for online retrieval, along with entity rows and feature @@ -408,12 +409,10 @@ def fake_ingest_data(): @pytest.fixture -def fake_ingest_document_data(): - """Fake document data to ingest into the feature store""" - data = { - "driver_id": [1], - "doc": [4, 5], - "event_timestamp": [pd.Timestamp(datetime.utcnow()).round("ms")], - "created": [pd.Timestamp(datetime.utcnow()).round("ms")], - } - return pd.DataFrame(data) +def fake_document_data(environment: Environment) -> Tuple[pd.DataFrame, DataSource]: + df = create_document_dataset() + data_source = environment.data_source_creator.create_data_source( + df, + environment.feature_store.project, + ) + return df, data_source diff --git a/sdk/python/tests/data/data_creator.py b/sdk/python/tests/data/data_creator.py index 1fc66aee845..96058cf4013 100644 --- a/sdk/python/tests/data/data_creator.py +++ b/sdk/python/tests/data/data_creator.py @@ -78,3 +78,21 @@ def get_feature_values_for_dtype( return [[n, n] if n is not None else None for n in non_list_val] else: return non_list_val + + +def create_document_dataset() -> pd.DataFrame: + data = { + "item_id": [1, 2, 3], + "embedding_float": [[4.0, 5.0], [1.0, 2.0], [3.0, 4.0]], + "ts": [ + pd.Timestamp(datetime.utcnow()).round("ms"), + pd.Timestamp(datetime.utcnow()).round("ms"), + pd.Timestamp(datetime.utcnow()).round("ms"), + ], + "created_ts": [ + pd.Timestamp(datetime.utcnow()).round("ms"), + pd.Timestamp(datetime.utcnow()).round("ms"), + pd.Timestamp(datetime.utcnow()).round("ms"), + ], + } + return pd.DataFrame(data) \ No newline at end of file diff --git a/sdk/python/tests/foo_provider.py b/sdk/python/tests/foo_provider.py index e280d73e682..7ba4adb114b 100644 --- a/sdk/python/tests/foo_provider.py +++ b/sdk/python/tests/foo_provider.py @@ -111,5 +111,5 @@ def retrieve_online_documents( requested_feature: str, query: List[float], top_k: int, - ) 
-> List[Tuple[Optional[datetime], Optional[ValueProto]]]: + ) -> List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]]: return [] diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index 48f6e27b8ae..8b766c532b3 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -140,7 +140,6 @@ def create_item_embeddings_feature_view(source, infer_features: bool = False): schema=None if infer_features else [ - Field(name="embedding_double", dtype=Array(Float64)), Field(name="embedding_float", dtype=Array(Float32)), ], source=source, diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py index 65758035e52..a03e3458c06 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py @@ -1,5 +1,7 @@ from typing import Dict +from testcontainers.core.container import DockerContainer +from testcontainers.core.waiting_utils import wait_for_logs from testcontainers.postgres import PostgresContainer from tests.integration.feature_repos.universal.online_store_creator import ( @@ -7,7 +9,7 @@ ) -class PostgresOnlieStoreCreator(OnlineStoreCreator): +class PostgresOnlineStoreCreator(OnlineStoreCreator): def __init__(self, project_name: str, **kwargs): super().__init__(project_name) self.container = PostgresContainer( @@ -30,3 +32,39 @@ def create_online_store(self) -> Dict[str, str]: def teardown(self): self.container.stop() + + +class PGVectorOnlineStoreCreator(OnlineStoreCreator): + def __init__(self, project_name: str, **kwargs): + super().__init__(project_name) + self.container = ( + DockerContainer("pgvector/pgvector:pg16") + 
.with_env("POSTGRES_USER", "root") + .with_env("POSTGRES_PASSWORD", "test") + .with_env("POSTGRES_DB", "test") + .with_exposed_ports(5432) + ) + + def create_online_store(self) -> Dict[str, str]: + self.container.start() + log_string_to_wait_for = "database system is ready to accept connections" + wait_for_logs( + container=self.container, + predicate=log_string_to_wait_for, + timeout=10 + ) + command = "psql -h localhost -p 5432 -U root -d test -c 'CREATE EXTENSION IF NOT EXISTS vector;'" + res = self.container.exec(command) + return { + "host": "localhost", + "type": "postgres", + "user": "root", + "password": "test", + "database": "test", + "pgvector_enabled": True, + "vector_len": 2, + "port": self.container.get_exposed_port(5432), + } + + def teardown(self): + self.container.stop() \ No newline at end of file diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 8b45345b92c..00b44901efa 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -25,10 +25,11 @@ Environment, construct_universal_feature_views, ) -from tests.integration.feature_repos.universal.entities import driver +from tests.integration.feature_repos.universal.entities import driver, item from tests.integration.feature_repos.universal.feature_views import ( create_driver_hourly_stats_feature_view, driver_feature_view, + create_item_embeddings_feature_view ) from tests.utils.data_source_test_creator import prep_file_source @@ -788,23 +789,18 @@ def assert_feature_service_entity_mapping_correctness( @pytest.mark.integration -@pytest.mark.universal_online_stores(only=["postgres"]) +@pytest.mark.universal_online_stores(only=["pgvector"]) def test_retrieve_online_documents( - environment, universal_data_sources, fake_ingest_document_data + environment, fake_document_data ): fs = environment.feature_store 
- entities, datasets, data_sources = universal_data_sources - driver_hourly_stats = create_driver_hourly_stats_feature_view(data_sources.driver) - driver_entity = driver() - - # Register Feature View and Entity - fs.apply([driver_hourly_stats, driver_entity]) - - # directly ingest data into the Online Store - fs.write_to_online_store("document_fv", fake_ingest_document_data) + df, data_source = fake_document_data + item_embeddings_feature_view = create_item_embeddings_feature_view(data_source) + fs.apply([item_embeddings_feature_view, item()]) + fs.write_to_online_store("item_embeddings", df) - # retrieve the online documents documents = fs.retrieve_online_documents( - feature="document_fv:doc", query="[1, 2]", top_k=5 - ) - assert len(documents) == 2 + feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2 + ).to_dict() + assert len(documents["embedding_float"]) == 2 + From d4f263929aedcbccd79d4cdfb29e8a2e7135ea2d Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 14 Apr 2024 15:34:53 -0700 Subject: [PATCH 34/40] fix postgres test --- sdk/python/feast/infra/online_stores/contrib/postgres.py | 5 +++-- .../online_stores/contrib/postgres_repo_configuration.py | 2 +- sdk/python/tests/conftest.py | 5 ++++- sdk/python/tests/data/data_creator.py | 2 +- .../feature_repos/universal/online_store/postgres.py | 4 ++-- .../tests/integration/online_store/test_universal_online.py | 2 +- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index e030193972f..264c301d670 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -9,6 +9,7 @@ from psycopg2 import sql from psycopg2.extras import execute_values from psycopg2.pool import SimpleConnectionPool + from feast import Entity from feast.feature_view import FeatureView from feast.infra.key_encoding_utils import 
serialize_entity_key @@ -320,8 +321,8 @@ def retrieve_online_documents( for entity_key, feature_name, value, distance, event_ts in rows: # TODO Deserialize entity_key to return the entity in response - entity_key_proto = EntityKeyProto() - entity_key_proto_bin = bytes(entity_key) + # entity_key_proto = EntityKeyProto() + # entity_key_proto_bin = bytes(entity_key) # TODO Convert to List[float] for value type proto feature_value_proto = ValueProto(string_val=value) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py index 9663623266c..3eb76136499 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -2,8 +2,8 @@ IntegrationTestRepoConfig, ) from tests.integration.feature_repos.universal.online_store.postgres import ( + PGVectorOnlineStoreCreator, PostgresOnlineStoreCreator, - PGVectorOnlineStoreCreator ) FULL_REPO_CONFIGS = [ diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 14c422feb92..0d40b4d8a11 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -26,7 +26,10 @@ from feast.data_source import DataSource from feast.feature_store import FeatureStore # noqa: E402 from feast.wait import wait_retry_backoff # noqa: E402 -from tests.data.data_creator import create_basic_driver_dataset, create_document_dataset # noqa: E402 +from tests.data.data_creator import ( # noqa: E402 + create_basic_driver_dataset, + create_document_dataset, +) from tests.integration.feature_repos.integration_test_repo_config import ( # noqa: E402 IntegrationTestRepoConfig, ) diff --git a/sdk/python/tests/data/data_creator.py b/sdk/python/tests/data/data_creator.py index 96058cf4013..092af2fa08a 100644 --- a/sdk/python/tests/data/data_creator.py +++ b/sdk/python/tests/data/data_creator.py @@ -95,4 +95,4 @@ def 
create_document_dataset() -> pd.DataFrame: pd.Timestamp(datetime.utcnow()).round("ms"), ], } - return pd.DataFrame(data) \ No newline at end of file + return pd.DataFrame(data) diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py index a03e3458c06..9f8ad41c5ee 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py @@ -54,7 +54,7 @@ def create_online_store(self) -> Dict[str, str]: timeout=10 ) command = "psql -h localhost -p 5432 -U root -d test -c 'CREATE EXTENSION IF NOT EXISTS vector;'" - res = self.container.exec(command) + self.container.exec(command) return { "host": "localhost", "type": "postgres", @@ -67,4 +67,4 @@ def create_online_store(self) -> Dict[str, str]: } def teardown(self): - self.container.stop() \ No newline at end of file + self.container.stop() diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 00b44901efa..1d6d8dc6fd2 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -28,8 +28,8 @@ from tests.integration.feature_repos.universal.entities import driver, item from tests.integration.feature_repos.universal.feature_views import ( create_driver_hourly_stats_feature_view, + create_item_embeddings_feature_view, driver_feature_view, - create_item_embeddings_feature_view ) from tests.utils.data_source_test_creator import prep_file_source From 396d7de27d67be4daef1fb012d50d477ffc48d06 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 14 Apr 2024 15:36:06 -0700 Subject: [PATCH 35/40] fix postgres test --- sdk/python/feast/feature_store.py | 4 ++-- .../infra/online_stores/contrib/postgres.py | 15 ++++++++------- 
.../contrib/postgres_repo_configuration.py | 6 ++---- sdk/python/feast/infra/provider.py | 2 +- sdk/python/tests/conftest.py | 18 +++++++++--------- .../universal/online_store/postgres.py | 4 +--- .../online_store/test_universal_online.py | 5 +---- 7 files changed, 24 insertions(+), 30 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index ba2c1dd34d3..616cc978a7a 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1748,11 +1748,11 @@ def _retrieve_online_documents( # TODO populate entity in the response after returning entity in document_features is supported self._populate_result_rows_from_columnar( online_features_response=online_features_response, - data={requested_feature: document_feature_vals} + data={requested_feature: document_feature_vals}, ) self._populate_result_rows_from_columnar( online_features_response=online_features_response, - data={"distance": document_feature_distance_vals} + data={"distance": document_feature_distance_vals}, ) return OnlineResponse(online_features_response) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 264c301d670..7074711594a 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -75,14 +75,14 @@ def online_write_batch( for feature_name, val in values.items(): if config.online_config["pgvector_enabled"]: - val = str(val.float_list_val.val) + val_str = str(val.float_list_val.val) else: - val = val.SerializeToString() + val_str = val.SerializeToString() insert_values.append( ( entity_key_bin, feature_name, - val, + val_str, timestamp, created_ts, ) @@ -272,7 +272,7 @@ def retrieve_online_documents( requested_feature: str, embedding: List[float], top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]]: + ) -> List[Tuple[Optional[datetime], 
Optional[ValueProto], Optional[ValueProto]]]: """ Args: @@ -290,7 +290,9 @@ def retrieve_online_documents( # Convert the embedding to a string to be used in postgres vector search query_embedding_str = f"[{','.join(str(el) for el in embedding)}]" - result: List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]] = [] + result: List[ + Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]] + ] = [] with self._get_conn(config) as conn, conn.cursor() as cur: table_name = _table_id(project, table) @@ -312,14 +314,13 @@ def retrieve_online_documents( ).format( table_name=sql.Identifier(table_name), feature_name=sql.Literal(requested_feature), - top_k=sql.Literal(top_k) + top_k=sql.Literal(top_k), ), (query_embedding_str,), ) rows = cur.fetchall() for entity_key, feature_name, value, distance, event_ts in rows: - # TODO Deserialize entity_key to return the entity in response # entity_key_proto = EntityKeyProto() # entity_key_proto_bin = bytes(entity_key) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py index 3eb76136499..003ea00a749 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -8,11 +8,9 @@ FULL_REPO_CONFIGS = [ IntegrationTestRepoConfig( - online_store="postgres", - online_store_creator=PostgresOnlineStoreCreator + online_store="postgres", online_store_creator=PostgresOnlineStoreCreator ), IntegrationTestRepoConfig( - online_store="pgvector", - online_store_creator=PGVectorOnlineStoreCreator + online_store="pgvector", online_store_creator=PGVectorOnlineStoreCreator ), ] diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 59355bc0a66..e71e87488d7 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -303,7 +303,7 @@ def 
retrieve_online_documents( requested_feature: str, query: List[float], top_k: int, - ) -> List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]]: + ) -> List[Tuple[Optional[datetime], Optional[ValueProto], Optional[ValueProto]]]: """ Searches for the top-k nearest neighbors of the given document in the online document store. diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 0d40b4d8a11..6abe30822f2 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -274,12 +274,12 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): # aws lambda works only with dynamo if ( - config.get("python_feature_server") - and config.get("provider") == "aws" - and ( + config.get("python_feature_server") + and config.get("provider") == "aws" + and ( not isinstance(online_store, dict) or online_store["type"] != "dynamodb" - ) + ) ): continue @@ -301,8 +301,8 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): @pytest.fixture def feature_server_endpoint(environment): if ( - not environment.python_feature_server - or environment.test_repo_config.provider != "local" + not environment.python_feature_server + or environment.test_repo_config.provider != "local" ): yield environment.feature_store.get_feature_server_endpoint() return @@ -314,8 +314,8 @@ def feature_server_endpoint(environment): args=(environment.feature_store.repo_path, port), ) if ( - environment.python_feature_server - and environment.test_repo_config.provider == "local" + environment.python_feature_server + and environment.test_repo_config.provider == "local" ): proc.start() # Wait for server to start @@ -358,7 +358,7 @@ def e2e_data_sources(environment: Environment): @pytest.fixture def feature_store_for_online_retrieval( - environment, universal_data_sources + environment, universal_data_sources ) -> Tuple[FeatureStore, List[str], List[Dict[str, int]]]: """ Returns a feature store that is ready for online retrieval, along with entity rows and 
feature diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py index 9f8ad41c5ee..58e7af9c468 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/postgres.py @@ -49,9 +49,7 @@ def create_online_store(self) -> Dict[str, str]: self.container.start() log_string_to_wait_for = "database system is ready to accept connections" wait_for_logs( - container=self.container, - predicate=log_string_to_wait_for, - timeout=10 + container=self.container, predicate=log_string_to_wait_for, timeout=10 ) command = "psql -h localhost -p 5432 -U root -d test -c 'CREATE EXTENSION IF NOT EXISTS vector;'" self.container.exec(command) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 1d6d8dc6fd2..3ae7be9e1e4 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -790,9 +790,7 @@ def assert_feature_service_entity_mapping_correctness( @pytest.mark.integration @pytest.mark.universal_online_stores(only=["pgvector"]) -def test_retrieve_online_documents( - environment, fake_document_data -): +def test_retrieve_online_documents(environment, fake_document_data): fs = environment.feature_store df, data_source = fake_document_data item_embeddings_feature_view = create_item_embeddings_feature_view(data_source) @@ -803,4 +801,3 @@ def test_retrieve_online_documents( feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2 ).to_dict() assert len(documents["embedding_float"]) == 2 - From 6c38b922f5bd26874abe422813a19856f9c5386f Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 14 Apr 2024 15:44:09 -0700 Subject: [PATCH 36/40] fix postgres test --- 
sdk/python/feast/infra/online_stores/contrib/postgres.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 7074711594a..30d7c372664 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -2,7 +2,7 @@ import logging from collections import defaultdict from datetime import datetime -from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple +from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Union import psycopg2 import pytz @@ -74,6 +74,7 @@ def online_write_batch( created_ts = _to_naive_utc(created_ts) for feature_name, val in values.items(): + val_str: Union[str, bytes] if config.online_config["pgvector_enabled"]: val_str = str(val.float_list_val.val) else: From f763dc92d0ad6543bcdc298dceb5f7fba8ff5408 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 14 Apr 2024 15:47:50 -0700 Subject: [PATCH 37/40] fix postgres test --- sdk/python/feast/feature_store.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 616cc978a7a..7809a65b4b6 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1991,26 +1991,22 @@ def _retrieve_from_online_store( top_k=top_k, ) - null_value = Value() - not_found_status = FieldStatus.NOT_FOUND - present_status = FieldStatus.PRESENT - read_row_protos = [] row_ts_proto = Timestamp() - for row_ts, feature_val, distance in documents: + for row_ts, feature_val, distance_val in documents: # Reset timestamp to default or update if row_ts is not None if row_ts is not None: row_ts_proto.FromDatetime(row_ts) if feature_val is None: - status = not_found_status - value = null_value + feature_val = Value() + distance_val = Value() + status = 
FieldStatus.NOT_FOUND else: - status = present_status - value = feature_val + status = FieldStatus.PRESENT - read_row_protos.append((row_ts_proto, status, value, distance)) + read_row_protos.append((row_ts_proto, status, feature_val, distance_val)) return read_row_protos @staticmethod From 818c0558bab2752080ae828bdd799d118a224040 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 14 Apr 2024 16:32:54 -0700 Subject: [PATCH 38/40] format --- sdk/python/feast/feature_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 7809a65b4b6..15598e1d609 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1999,7 +1999,7 @@ def _retrieve_from_online_store( if row_ts is not None: row_ts_proto.FromDatetime(row_ts) - if feature_val is None: + if feature_val is None or distance_val is None: feature_val = Value() distance_val = Value() status = FieldStatus.NOT_FOUND From a51b555dcd7c2323d1b05864faeec924fbbbd78f Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 14 Apr 2024 17:38:47 -0700 Subject: [PATCH 39/40] format --- sdk/python/feast/infra/key_encoding_utils.py | 12 ++++++++++-- .../feast/infra/online_stores/contrib/postgres.py | 8 ++++---- .../contrib/postgres_repo_configuration.py | 2 ++ sdk/python/tests/data/data_creator.py | 1 + .../feature_repos/universal/feature_views.py | 1 + .../universal/online_store/cassandra.py | 6 ++++-- 6 files changed, 22 insertions(+), 8 deletions(-) diff --git a/sdk/python/feast/infra/key_encoding_utils.py b/sdk/python/feast/infra/key_encoding_utils.py index 62b6b72724e..7b39becc1c6 100644 --- a/sdk/python/feast/infra/key_encoding_utils.py +++ b/sdk/python/feast/infra/key_encoding_utils.py @@ -7,7 +7,7 @@ def _serialize_val( - value_type, v: ValueProto, entity_key_serialization_version=1 + value_type, v: ValueProto, entity_key_serialization_version=1 ) -> Tuple[bytes, int]: if value_type == "string_val": return 
v.string_val.encode("utf8"), ValueType.STRING @@ -40,7 +40,7 @@ def serialize_entity_key_prefix(entity_keys: List[str]) -> bytes: def serialize_entity_key( - entity_key: EntityKeyProto, entity_key_serialization_version=1 + entity_key: EntityKeyProto, entity_key_serialization_version=1 ) -> bytes: """ Serialize entity key to a bytestring so it can be used as a lookup key in a hash table. @@ -72,3 +72,11 @@ def serialize_entity_key( output.append(val_bytes) return b"".join(output) + + +def get_val_str(val): + accept_value_types = ["float_list_val", "double_list_val", "int_list_val"] + for accept_type in accept_value_types: + if val.HasField(accept_type): + return str(getattr(val, accept_type).val) + return None diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 30d7c372664..a55f4aa37dc 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -12,7 +12,7 @@ from feast import Entity from feast.feature_view import FeatureView -from feast.infra.key_encoding_utils import serialize_entity_key +from feast.infra.key_encoding_utils import serialize_entity_key, get_val_str from feast.infra.online_stores.online_store import OnlineStore from feast.infra.utils.postgres.connection_utils import _get_conn, _get_connection_pool from feast.infra.utils.postgres.postgres_config import ConnectionType, PostgreSQLConfig @@ -75,8 +75,8 @@ def online_write_batch( for feature_name, val in values.items(): val_str: Union[str, bytes] - if config.online_config["pgvector_enabled"]: - val_str = str(val.float_list_val.val) + if "pgvector_enabled" in config.online_config and config.online_config["pgvector_enabled"]: + val_str = get_val_str(val) else: val_str = val.SerializeToString() insert_values.append( @@ -224,7 +224,7 @@ def update( for table in tables_to_keep: table_name = _table_id(project, table) value_type = "BYTEA" - if 
config.online_config["pgvector_enabled"]: + if "pgvector_enabled" in config.online_config and config.online_config["pgvector_enabled"]: value_type = f'vector({config.online_config["vector_len"]})' cur.execute( sql.SQL( diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py index 003ea00a749..6e4ca3f9501 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres_repo_configuration.py @@ -14,3 +14,5 @@ online_store="pgvector", online_store_creator=PGVectorOnlineStoreCreator ), ] + +AVAILABLE_ONLINE_STORES = {"pgvector": PGVectorOnlineStoreCreator} diff --git a/sdk/python/tests/data/data_creator.py b/sdk/python/tests/data/data_creator.py index 092af2fa08a..1be96f753a7 100644 --- a/sdk/python/tests/data/data_creator.py +++ b/sdk/python/tests/data/data_creator.py @@ -84,6 +84,7 @@ def create_document_dataset() -> pd.DataFrame: data = { "item_id": [1, 2, 3], "embedding_float": [[4.0, 5.0], [1.0, 2.0], [3.0, 4.0]], + "embedding_double": [[4.0, 5.0], [1.0, 2.0], [3.0, 4.0]], "ts": [ pd.Timestamp(datetime.utcnow()).round("ms"), pd.Timestamp(datetime.utcnow()).round("ms"), diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index 8b766c532b3..48f6e27b8ae 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -140,6 +140,7 @@ def create_item_embeddings_feature_view(source, infer_features: bool = False): schema=None if infer_features else [ + Field(name="embedding_double", dtype=Array(Float64)), Field(name="embedding_float", dtype=Array(Float32)), ], source=source, diff --git a/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py 
b/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py index 41ff6d329d9..190d94a8305 100644 --- a/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py +++ b/sdk/python/tests/integration/feature_repos/universal/online_store/cassandra.py @@ -17,7 +17,7 @@ import time from typing import Dict -from testcontainers.cassandra import CassandraContainer +from testcontainers.core.container import DockerContainer from testcontainers.core.waiting_utils import wait_for_logs from tests.integration.feature_repos.universal.online_store_creator import ( @@ -28,7 +28,9 @@ class CassandraOnlineStoreCreator(OnlineStoreCreator): def __init__(self, project_name: str, **kwargs): super().__init__(project_name) - self.container = CassandraContainer("cassandra:4.1.4").with_exposed_ports(9042) + self.container = DockerContainer("library/cassandra:4.0.4").with_exposed_ports( + "9042" + ) def create_online_store(self) -> Dict[str, object]: self.container.start() From 2624b22f4b6d3500323cb358b183c26956bd9c02 Mon Sep 17 00:00:00 2001 From: cmuhao Date: Sun, 14 Apr 2024 17:39:48 -0700 Subject: [PATCH 40/40] format --- sdk/python/feast/infra/key_encoding_utils.py | 4 ++-- .../feast/infra/online_stores/contrib/postgres.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/sdk/python/feast/infra/key_encoding_utils.py b/sdk/python/feast/infra/key_encoding_utils.py index 7b39becc1c6..e50e438c3de 100644 --- a/sdk/python/feast/infra/key_encoding_utils.py +++ b/sdk/python/feast/infra/key_encoding_utils.py @@ -7,7 +7,7 @@ def _serialize_val( - value_type, v: ValueProto, entity_key_serialization_version=1 + value_type, v: ValueProto, entity_key_serialization_version=1 ) -> Tuple[bytes, int]: if value_type == "string_val": return v.string_val.encode("utf8"), ValueType.STRING @@ -40,7 +40,7 @@ def serialize_entity_key_prefix(entity_keys: List[str]) -> bytes: def serialize_entity_key( - entity_key: EntityKeyProto, 
entity_key_serialization_version=1 + entity_key: EntityKeyProto, entity_key_serialization_version=1 ) -> bytes: """ Serialize entity key to a bytestring so it can be used as a lookup key in a hash table. diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index a55f4aa37dc..2dcb6187837 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -12,7 +12,7 @@ from feast import Entity from feast.feature_view import FeatureView -from feast.infra.key_encoding_utils import serialize_entity_key, get_val_str +from feast.infra.key_encoding_utils import get_val_str, serialize_entity_key from feast.infra.online_stores.online_store import OnlineStore from feast.infra.utils.postgres.connection_utils import _get_conn, _get_connection_pool from feast.infra.utils.postgres.postgres_config import ConnectionType, PostgreSQLConfig @@ -75,7 +75,10 @@ def online_write_batch( for feature_name, val in values.items(): val_str: Union[str, bytes] - if "pgvector_enabled" in config.online_config and config.online_config["pgvector_enabled"]: + if ( + "pgvector_enabled" in config.online_config + and config.online_config["pgvector_enabled"] + ): val_str = get_val_str(val) else: val_str = val.SerializeToString() @@ -224,7 +227,10 @@ def update( for table in tables_to_keep: table_name = _table_id(project, table) value_type = "BYTEA" - if "pgvector_enabled" in config.online_config and config.online_config["pgvector_enabled"]: + if ( + "pgvector_enabled" in config.online_config + and config.online_config["pgvector_enabled"] + ): value_type = f'vector({config.online_config["vector_len"]})' cur.execute( sql.SQL(