From 2bd1992b0b868410f146cfa03974a3e89b43911b Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 11:39:31 +0000 Subject: [PATCH 1/6] feat: Add dtype parameters to to_geodataframe This change adds support for `bool_dtype`, `int_dtype`, `float_dtype`, and `string_dtype` parameters to the `to_geodataframe` method in `RowIterator` and `QueryJob`. These parameters allow you to specify the desired pandas dtypes for boolean, integer, float, and string columns when converting BigQuery results to GeoDataFrames. The changes include: - Updating `RowIterator.to_geodataframe` to accept and pass these dtype parameters to the underlying `to_dataframe` method. - Updating `QueryJob.to_geodataframe` to accept and pass these dtype parameters to the underlying `RowIterator.to_geodataframe` method. - Adding unit tests to verify the correct handling of these parameters. --- google/cloud/bigquery/job/query.py | 36 ++++++++++ google/cloud/bigquery/table.py | 44 ++++++++++++ tests/unit/test_table_pandas.py | 103 +++++++++++++++++++++++++++++ 3 files changed, 183 insertions(+) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index f14039bc0..f9b99b7fb 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -2102,6 +2102,10 @@ def to_geodataframe( create_bqstorage_client: bool = True, max_results: Optional[int] = None, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "geopandas.GeoDataFrame": """Return a GeoPandas GeoDataFrame from a QueryJob @@ -2152,6 +2156,34 @@ def to_geodataframe( identifies which one to use to construct a GeoPandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. + bool_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) + to convert BigQuery Boolean type, instead of relying on the default + ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type + int_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) + to convert BigQuery Integer types, instead of relying on the default + ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("int64")``. A list of BigQuery + Integer types can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types + float_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) + to convert BigQuery Float type, instead of relying on the default + ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("float64")``. BigQuery Float + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types + string_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to + convert BigQuery String type, instead of relying on the default + ``numpy.dtype("object")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("object")``. BigQuery String + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type Returns: geopandas.GeoDataFrame: @@ -2175,6 +2207,10 @@ def to_geodataframe( progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, ) def __iter__(self): diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 503ca4e71..e084468f6 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2727,6 +2727,10 @@ def to_geodataframe( progress_bar_type: Optional[str] = None, create_bqstorage_client: bool = True, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "geopandas.GeoDataFrame": """Create a GeoPandas GeoDataFrame by loading all pages of a query. @@ -2778,6 +2782,34 @@ def to_geodataframe( identifies which one to use to construct a geopandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. + bool_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) + to convert BigQuery Boolean type, instead of relying on the default + ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type + int_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) + to convert BigQuery Integer types, instead of relying on the default + ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("int64")``. A list of BigQuery + Integer types can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types + float_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) + to convert BigQuery Float type, instead of relying on the default + ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("float64")``. BigQuery Float + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types + string_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to + convert BigQuery String type, instead of relying on the default + ``numpy.dtype("object")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("object")``. BigQuery String + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type Returns: geopandas.GeoDataFrame: @@ -2829,6 +2861,10 @@ def to_geodataframe( progress_bar_type, create_bqstorage_client, geography_as_object=True, + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, ) return geopandas.GeoDataFrame( @@ -2932,6 +2968,10 @@ def to_geodataframe( progress_bar_type=None, create_bqstorage_client=True, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "pandas.DataFrame": """Create an empty dataframe. @@ -2941,6 +2981,10 @@ def to_geodataframe( progress_bar_type (Any): Ignored. Added for compatibility with RowIterator. create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. geography_column (str): Ignored. Added for compatibility with RowIterator. + bool_dtype (Any): Ignored. Added for compatibility with RowIterator. + int_dtype (Any): Ignored. Added for compatibility with RowIterator. + float_dtype (Any): Ignored. Added for compatibility with RowIterator. + string_dtype (Any): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 94737732b..43d64d77d 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -261,3 +261,106 @@ def test_to_dataframe_with_jobs_query_response(class_under_test): "Tiffani", ] assert list(df["number"]) == [6, 325, 26, 10, 17, 22, 6, 229, 8] + + +@mock.patch("google.cloud.bigquery.table.geopandas") +def test_rowiterator_to_geodataframe_with_default_dtypes( + mock_geopandas, monkeypatch, class_under_test +): + mock_geopandas.GeoDataFrame = mock.Mock(spec=True) + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + schema = [ + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("string_col", "STRING"), + ] + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, schema) + + mock_df = pandas.DataFrame( + { + "geo_col": ["POINT (1 2)"], + "bool_col": [True], + "int_col": [123], + "float_col": [1.23], + "string_col": ["abc"], + } + ) + rows.to_dataframe = mock.Mock(return_value=mock_df) + + rows.to_geodataframe(geography_column="geo_col") + + rows.to_dataframe.assert_called_once_with( + None, # bqstorage_client + None, # dtypes + None, # progress_bar_type + True, # create_bqstorage_client + geography_as_object=True, + bool_dtype=bigquery.enums.DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=bigquery.enums.DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, + ) + mock_geopandas.GeoDataFrame.assert_called_once_with( + mock_df, crs="EPSG:4326", geometry="geo_col" + ) + + +@mock.patch("google.cloud.bigquery.table.geopandas") +def test_rowiterator_to_geodataframe_with_custom_dtypes( + mock_geopandas, monkeypatch, class_under_test +): + mock_geopandas.GeoDataFrame = mock.Mock(spec=True) + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + schema = [ + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("string_col", "STRING"), + ] + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, schema) + + mock_df = pandas.DataFrame( + { + "geo_col": ["POINT (3 4)"], + "bool_col": [False], + "int_col": [456], + "float_col": [4.56], + "string_col": ["def"], + } + ) + rows.to_dataframe = mock.Mock(return_value=mock_df) + + custom_bool_dtype = "bool" + custom_int_dtype = "int32" + custom_float_dtype = "float32" + custom_string_dtype = "string" + + rows.to_geodataframe( + geography_column="geo_col", + bool_dtype=custom_bool_dtype, + int_dtype=custom_int_dtype, + float_dtype=custom_float_dtype, + string_dtype=custom_string_dtype, + ) + + rows.to_dataframe.assert_called_once_with( + None, # bqstorage_client + None, # dtypes + None, # progress_bar_type + True, # create_bqstorage_client + geography_as_object=True, + bool_dtype=custom_bool_dtype, + int_dtype=custom_int_dtype, + float_dtype=custom_float_dtype, + string_dtype=custom_string_dtype, + ) + mock_geopandas.GeoDataFrame.assert_called_once_with( + mock_df, crs="EPSG:4326", geometry="geo_col" + ) From b4edb9c1fb49ab0d6acf99befd70800cfa8b6ca5 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 14 May 2025 13:11:29 +0000 Subject: [PATCH 2/6] updates to several tests re geopandas as well as imports --- noxfile.py | 4 +--- pyproject.toml | 8 ++++++-- testing/constraints-3.9.txt | 2 +- tests/unit/job/test_query_pandas.py | 6 ++++++ tests/unit/test_table.py | 10 ++++++++-- 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/noxfile.py b/noxfile.py index c2b4bbb50..1922a68a5 100644 --- a/noxfile.py +++ b/noxfile.py @@ -109,9 +109,7 @@ def default(session, install_extras=True): # that logic (and the associated tests) we avoid installing the [ipython] extra # which has a downstream effect of then avoiding installing bigquery_magics. if install_extras and session.python == UNIT_TEST_PYTHON_VERSIONS[0]: - install_target = ( - ".[bqstorage,pandas,ipywidgets,geopandas,tqdm,opentelemetry,bigquery_v2]" - ) + install_target = ".[bqstorage,pandas,ipywidgets,geopandas,matplotlib,tqdm,opentelemetry,bigquery_v2]" elif install_extras: # run against all other UNIT_TEST_PYTHON_VERSIONS install_target = ".[all]" else: diff --git a/pyproject.toml b/pyproject.toml index 38d74cdd0..058b5979d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,11 @@ pandas = [ ipywidgets = ["ipywidgets >= 7.7.1", "ipykernel >= 6.2.0"] geopandas = ["geopandas >= 0.9.0, < 2.0.0", "Shapely >= 1.8.4, < 3.0.0"] ipython = ["ipython >= 7.23.1", "bigquery-magics >= 0.6.0"] -tqdm = ["tqdm >= 4.7.4, < 5.0.0"] +matplotlib = [ + "matplotlib == 3.9.2; python_version == '3.9'", + "matplotlib >= 3.10.3; python_version >= '3.10'", +] +tqdm = ["tqdm >= 4.23.4, < 5.0.0"] opentelemetry = [ "opentelemetry-api >= 1.1.0", "opentelemetry-sdk >= 1.1.0", @@ -93,7 +97,7 @@ bigquery_v2 = [ "protobuf >= 3.20.2, < 7.0.0, != 4.21.0, != 4.21.1, != 4.21.2, != 4.21.3, != 4.21.4, != 4.21.5", # For the legacy proto-based types. ] all = [ - "google-cloud-bigquery[bqstorage,pandas,ipywidgets,geopandas,ipython,tqdm,opentelemetry,bigquery_v2]", + "google-cloud-bigquery[bqstorage,pandas,ipywidgets,geopandas,ipython,matplotlib,tqdm,opentelemetry,bigquery_v2]", ] [tool.setuptools.dynamic] diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index cb6c29f3b..056ef7417 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -29,4 +29,4 @@ pyarrow==4.0.0 python-dateutil==2.8.2 requests==2.21.0 Shapely==1.8.4 -tqdm==4.7.4 +tqdm==4.23.4 diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 2cda59bd1..bc336a41d 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -1000,6 +1000,8 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): max_results = 42 geography_column = "g" + from google.cloud.bigquery.enums import DefaultPandasDTypes + df = job.to_geodataframe( bqstorage_client=bqstorage_client, dtypes=dtypes, @@ -1019,5 +1021,9 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, + bool_dtype=DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, ) assert df is row_iterator.to_geodataframe.return_value diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 253006547..ca7c4724f 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -4065,7 +4065,7 @@ def test_to_dataframe_no_tqdm(self): def test_to_dataframe_tqdm_error(self): pytest.importorskip("pandas") - pytest.importorskip("tqdm") + tqdm = pytest.importorskip("tqdm") mock.patch("tqdm.tqdm_gui", new=None) mock.patch("tqdm.notebook.tqdm", new=None) mock.patch("tqdm.tqdm", new=None) @@ -4100,7 +4100,7 @@ def test_to_dataframe_tqdm_error(self): for warning in warned: # pragma: NO COVER self.assertIn( warning.category, - [UserWarning, DeprecationWarning], + [UserWarning, DeprecationWarning, tqdm.TqdmExperimentalWarning], ) def test_to_dataframe_w_empty_results(self): @@ -5633,12 +5633,18 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): geography_column=geography_column, ) + from google.cloud.bigquery.enums import DefaultPandasDTypes + to_dataframe.assert_called_once_with( bqstorage_client, dtypes, progress_bar_type, create_bqstorage_client, geography_as_object=True, + bool_dtype=DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, ) self.assertIsInstance(df, geopandas.GeoDataFrame) From e49e808a195de8037f5ba633808860a2838f5a2e Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 14 May 2025 13:28:54 +0000 Subject: [PATCH 3/6] updates to enum import --- tests/unit/job/test_query_pandas.py | 4 ++-- tests/unit/test_table.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index bc336a41d..d82f0dfe3 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -22,6 +22,7 @@ from ..helpers import make_connection from .helpers import _make_client from .helpers import _make_job_resource +from google.cloud.bigquery.enums import DefaultPandasDTypes try: from google.cloud import bigquery_storage @@ -30,6 +31,7 @@ except (ImportError, AttributeError): bigquery_storage = None + try: import shapely except (ImportError, AttributeError): @@ -1000,8 +1002,6 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): max_results = 42 geography_column = "g" - from google.cloud.bigquery.enums import DefaultPandasDTypes - df = job.to_geodataframe( bqstorage_client=bqstorage_client, dtypes=dtypes, diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index ca7c4724f..8daa4ce43 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -31,6 +31,7 @@ from google.cloud.bigquery import exceptions from google.cloud.bigquery import external_config from google.cloud.bigquery import schema +from google.cloud.bigquery.enums import DefaultPandasDTypes from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.dataset import DatasetReference @@ -5633,8 +5634,6 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): geography_column=geography_column, ) - from google.cloud.bigquery.enums import DefaultPandasDTypes - to_dataframe.assert_called_once_with( bqstorage_client, dtypes, From 79cde614043469a2be4e87aa414b66bfe324d909 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 14 May 2025 13:31:35 +0000 Subject: [PATCH 4/6] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- docs/conf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 826298090..df1c18b68 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ # autodoc/autosummary flags autoclass_content = "both" -autodoc_default_options = {"members": True, "inherited-members": True} +autodoc_default_options = {"members": True} autosummary_generate = True @@ -109,7 +109,6 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = [ - "google/cloud/bigquery_v2/**", # Legacy proto-based types. "_build", "**/.nox/**/*", "samples/AUTHORING_GUIDE.md", From ee5533ab1eb34c52858d7d4fe1914f4cf7ce9cc2 Mon Sep 17 00:00:00 2001 From: Chalmer Lowe Date: Wed, 14 May 2025 12:02:05 -0400 Subject: [PATCH 5/6] Update pyproject.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 058b5979d..9c91a2fc8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ ipywidgets = ["ipywidgets >= 7.7.1", "ipykernel >= 6.2.0"] geopandas = ["geopandas >= 0.9.0, < 2.0.0", "Shapely >= 1.8.4, < 3.0.0"] ipython = ["ipython >= 7.23.1", "bigquery-magics >= 0.6.0"] matplotlib = [ - "matplotlib == 3.9.2; python_version == '3.9'", + "matplotlib >= 3.7.1, <= 3.9.2; python_version == '3.9'", "matplotlib >= 3.10.3; python_version >= '3.10'", ] tqdm = ["tqdm >= 4.23.4, < 5.0.0"] From d3314e0955a959d35343c34a4dc0b4bacc0f752c Mon Sep 17 00:00:00 2001 From: Chalmer Lowe Date: Wed, 14 May 2025 12:02:16 -0400 Subject: [PATCH 6/6] Update testing/constraints-3.9.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- testing/constraints-3.9.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 056ef7417..60a155f0d 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -29,4 +29,4 @@ pyarrow==4.0.0 python-dateutil==2.8.2 requests==2.21.0 Shapely==1.8.4 -tqdm==4.23.4 +matplotlib==3.7.1