diff --git a/docs/conf.py b/docs/conf.py index 826298090..df1c18b68 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ # autodoc/autosummary flags autoclass_content = "both" -autodoc_default_options = {"members": True, "inherited-members": True} +autodoc_default_options = {"members": True} autosummary_generate = True @@ -109,7 +109,6 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = [ - "google/cloud/bigquery_v2/**", # Legacy proto-based types. "_build", "**/.nox/**/*", "samples/AUTHORING_GUIDE.md", diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index f14039bc0..f9b99b7fb 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -2102,6 +2102,10 @@ def to_geodataframe( create_bqstorage_client: bool = True, max_results: Optional[int] = None, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "geopandas.GeoDataFrame": """Return a GeoPandas GeoDataFrame from a QueryJob @@ -2152,6 +2156,34 @@ def to_geodataframe( identifies which one to use to construct a GeoPandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. + bool_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) + to convert BigQuery Boolean type, instead of relying on the default + ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type + int_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) + to convert BigQuery Integer types, instead of relying on the default + ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("int64")``. A list of BigQuery + Integer types can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types + float_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) + to convert BigQuery Float type, instead of relying on the default + ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("float64")``. BigQuery Float + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types + string_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to + convert BigQuery String type, instead of relying on the default + ``numpy.dtype("object")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("object")``. BigQuery String + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type Returns: geopandas.GeoDataFrame: @@ -2175,6 +2207,10 @@ def to_geodataframe( progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, ) def __iter__(self): diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 503ca4e71..e084468f6 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2727,6 +2727,10 @@ def to_geodataframe( progress_bar_type: Optional[str] = None, create_bqstorage_client: bool = True, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "geopandas.GeoDataFrame": """Create a GeoPandas GeoDataFrame by loading all pages of a query. @@ -2778,6 +2782,34 @@ def to_geodataframe( identifies which one to use to construct a geopandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. + bool_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) + to convert BigQuery Boolean type, instead of relying on the default + ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type + int_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) + to convert BigQuery Integer types, instead of relying on the default + ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("int64")``. A list of BigQuery + Integer types can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types + float_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) + to convert BigQuery Float type, instead of relying on the default + ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("float64")``. BigQuery Float + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types + string_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to + convert BigQuery String type, instead of relying on the default + ``numpy.dtype("object")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("object")``. BigQuery String + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type Returns: geopandas.GeoDataFrame: @@ -2829,6 +2861,10 @@ def to_geodataframe( progress_bar_type, create_bqstorage_client, geography_as_object=True, + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, ) return geopandas.GeoDataFrame( @@ -2932,6 +2968,10 @@ def to_geodataframe( progress_bar_type=None, create_bqstorage_client=True, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "pandas.DataFrame": """Create an empty dataframe. @@ -2941,6 +2981,10 @@ def to_geodataframe( progress_bar_type (Any): Ignored. Added for compatibility with RowIterator. create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. geography_column (str): Ignored. Added for compatibility with RowIterator. + bool_dtype (Any): Ignored. Added for compatibility with RowIterator. + int_dtype (Any): Ignored. Added for compatibility with RowIterator. + float_dtype (Any): Ignored. Added for compatibility with RowIterator. + string_dtype (Any): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. diff --git a/noxfile.py b/noxfile.py index c2b4bbb50..1922a68a5 100644 --- a/noxfile.py +++ b/noxfile.py @@ -109,9 +109,7 @@ def default(session, install_extras=True): # that logic (and the associated tests) we avoid installing the [ipython] extra # which has a downstream effect of then avoiding installing bigquery_magics. if install_extras and session.python == UNIT_TEST_PYTHON_VERSIONS[0]: - install_target = ( - ".[bqstorage,pandas,ipywidgets,geopandas,tqdm,opentelemetry,bigquery_v2]" - ) + install_target = ".[bqstorage,pandas,ipywidgets,geopandas,matplotlib,tqdm,opentelemetry,bigquery_v2]" elif install_extras: # run against all other UNIT_TEST_PYTHON_VERSIONS install_target = ".[all]" else: diff --git a/pyproject.toml b/pyproject.toml index 38d74cdd0..9c91a2fc8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,11 @@ pandas = [ ipywidgets = ["ipywidgets >= 7.7.1", "ipykernel >= 6.2.0"] geopandas = ["geopandas >= 0.9.0, < 2.0.0", "Shapely >= 1.8.4, < 3.0.0"] ipython = ["ipython >= 7.23.1", "bigquery-magics >= 0.6.0"] -tqdm = ["tqdm >= 4.7.4, < 5.0.0"] +matplotlib = [ + "matplotlib >= 3.7.1, <= 3.9.2; python_version == '3.9'", + "matplotlib >= 3.10.3; python_version >= '3.10'", +] +tqdm = ["tqdm >= 4.23.4, < 5.0.0"] opentelemetry = [ "opentelemetry-api >= 1.1.0", "opentelemetry-sdk >= 1.1.0", @@ -93,7 +97,7 @@ bigquery_v2 = [ "protobuf >= 3.20.2, < 7.0.0, != 4.21.0, != 4.21.1, != 4.21.2, != 4.21.3, != 4.21.4, != 4.21.5", # For the legacy proto-based types. ] all = [ - "google-cloud-bigquery[bqstorage,pandas,ipywidgets,geopandas,ipython,tqdm,opentelemetry,bigquery_v2]", + "google-cloud-bigquery[bqstorage,pandas,ipywidgets,geopandas,ipython,matplotlib,tqdm,opentelemetry,bigquery_v2]", ] [tool.setuptools.dynamic] diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index cb6c29f3b..60a155f0d 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -29,4 +29,4 @@ pyarrow==4.0.0 python-dateutil==2.8.2 requests==2.21.0 Shapely==1.8.4 -tqdm==4.7.4 +matplotlib==3.7.1 diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 2cda59bd1..d82f0dfe3 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -22,6 +22,7 @@ from ..helpers import make_connection from .helpers import _make_client from .helpers import _make_job_resource +from google.cloud.bigquery.enums import DefaultPandasDTypes try: from google.cloud import bigquery_storage @@ -30,6 +31,7 @@ except (ImportError, AttributeError): bigquery_storage = None + try: import shapely except (ImportError, AttributeError): @@ -1019,5 +1021,9 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, + bool_dtype=DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, ) assert df is row_iterator.to_geodataframe.return_value diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 253006547..8daa4ce43 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -31,6 +31,7 @@ from google.cloud.bigquery import exceptions from google.cloud.bigquery import external_config from google.cloud.bigquery import schema +from google.cloud.bigquery.enums import DefaultPandasDTypes from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.dataset import DatasetReference @@ -4065,7 +4066,7 @@ def test_to_dataframe_no_tqdm(self): def test_to_dataframe_tqdm_error(self): pytest.importorskip("pandas") - pytest.importorskip("tqdm") + tqdm = pytest.importorskip("tqdm") mock.patch("tqdm.tqdm_gui", new=None) mock.patch("tqdm.notebook.tqdm", new=None) mock.patch("tqdm.tqdm", new=None) @@ -4100,7 +4101,7 @@ def test_to_dataframe_tqdm_error(self): for warning in warned: # pragma: NO COVER self.assertIn( warning.category, - [UserWarning, DeprecationWarning], + [UserWarning, DeprecationWarning, tqdm.TqdmExperimentalWarning], ) def test_to_dataframe_w_empty_results(self): @@ -5639,6 +5640,10 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): progress_bar_type, create_bqstorage_client, geography_as_object=True, + bool_dtype=DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, ) self.assertIsInstance(df, geopandas.GeoDataFrame) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 94737732b..43d64d77d 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -261,3 +261,106 @@ def test_to_dataframe_with_jobs_query_response(class_under_test): "Tiffani", ] assert list(df["number"]) == [6, 325, 26, 10, 17, 22, 6, 229, 8] + + +@mock.patch("google.cloud.bigquery.table.geopandas") +def test_rowiterator_to_geodataframe_with_default_dtypes( + mock_geopandas, monkeypatch, class_under_test +): + mock_geopandas.GeoDataFrame = mock.Mock(spec=True) + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + schema = [ + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("string_col", "STRING"), + ] + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, schema) + + mock_df = pandas.DataFrame( + { + "geo_col": ["POINT (1 2)"], + "bool_col": [True], + "int_col": [123], + "float_col": [1.23], + "string_col": ["abc"], + } + ) + rows.to_dataframe = mock.Mock(return_value=mock_df) + + rows.to_geodataframe(geography_column="geo_col") + + rows.to_dataframe.assert_called_once_with( + None, # bqstorage_client + None, # dtypes + None, # progress_bar_type + True, # create_bqstorage_client + geography_as_object=True, + bool_dtype=bigquery.enums.DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=bigquery.enums.DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, + ) + mock_geopandas.GeoDataFrame.assert_called_once_with( + mock_df, crs="EPSG:4326", geometry="geo_col" + ) + + +@mock.patch("google.cloud.bigquery.table.geopandas") +def test_rowiterator_to_geodataframe_with_custom_dtypes( + mock_geopandas, monkeypatch, class_under_test +): + mock_geopandas.GeoDataFrame = mock.Mock(spec=True) + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + schema = [ + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("string_col", "STRING"), + ] + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, schema) + + mock_df = pandas.DataFrame( + { + "geo_col": ["POINT (3 4)"], + "bool_col": [False], + "int_col": [456], + "float_col": [4.56], + "string_col": ["def"], + } + ) + rows.to_dataframe = mock.Mock(return_value=mock_df) + + custom_bool_dtype = "bool" + custom_int_dtype = "int32" + custom_float_dtype = "float32" + custom_string_dtype = "string" + + rows.to_geodataframe( + geography_column="geo_col", + bool_dtype=custom_bool_dtype, + int_dtype=custom_int_dtype, + float_dtype=custom_float_dtype, + string_dtype=custom_string_dtype, + ) + + rows.to_dataframe.assert_called_once_with( + None, # bqstorage_client + None, # dtypes + None, # progress_bar_type + True, # create_bqstorage_client + geography_as_object=True, + bool_dtype=custom_bool_dtype, + int_dtype=custom_int_dtype, + float_dtype=custom_float_dtype, + string_dtype=custom_string_dtype, + ) + mock_geopandas.GeoDataFrame.assert_called_once_with( + mock_df, crs="EPSG:4326", geometry="geo_col" + )