@@ -58,7 +58,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
|
58 | 58 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
59 | 59 | )
|
60 | 60 |
|
61 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 61 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
62 | 62 |
|
63 | 63 |
|
64 | 64 | def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
|
@@ -82,7 +82,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
|
82 | 82 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
83 | 83 | )
|
84 | 84 |
|
85 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 85 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
86 | 86 |
|
87 | 87 |
|
88 | 88 | def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
|
@@ -110,7 +110,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
|
110 | 110 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
111 | 111 | )
|
112 | 112 |
|
113 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 113 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
114 | 114 |
|
115 | 115 |
|
116 | 116 | def test_standard_scaler_save_load(new_penguins_df, dataset_id):
|
@@ -125,6 +125,22 @@ def test_standard_scaler_save_load(new_penguins_df, dataset_id):
|
125 | 125 | assert isinstance(reloaded_transformer, preprocessing.StandardScaler)
|
126 | 126 | assert reloaded_transformer._bqml_model is not None
|
127 | 127 |
|
| 128 | +result = reloaded_transformer.transform( |
| 129 | +new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] |
| 130 | +).to_pandas() |
| 131 | + |
| 132 | +expected = pd.DataFrame( |
| 133 | +{ |
| 134 | +"standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], |
| 135 | +"standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], |
| 136 | +"standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], |
| 137 | +}, |
| 138 | +dtype="Float64", |
| 139 | +index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), |
| 140 | +) |
| 141 | + |
| 142 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
| 143 | + |
128 | 144 |
|
129 | 145 | def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
|
130 | 146 | # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod.
|
@@ -157,7 +173,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
|
157 | 173 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
158 | 174 | )
|
159 | 175 |
|
160 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 176 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
161 | 177 |
|
162 | 178 |
|
163 | 179 | def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):
|
@@ -176,7 +192,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):
|
176 | 192 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
177 | 193 | )
|
178 | 194 |
|
179 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 195 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
180 | 196 |
|
181 | 197 |
|
182 | 198 | def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
|
@@ -199,7 +215,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin
|
199 | 215 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
200 | 216 | )
|
201 | 217 |
|
202 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 218 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
203 | 219 |
|
204 | 220 |
|
205 | 221 | def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
|
@@ -214,6 +230,22 @@ def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
|
214 | 230 | assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler)
|
215 | 231 | assert reloaded_transformer._bqml_model is not None
|
216 | 232 |
|
| 233 | +result = reloaded_transformer.transform( |
| 234 | +new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] |
| 235 | +).to_pandas() |
| 236 | + |
| 237 | +expected = pd.DataFrame( |
| 238 | +{ |
| 239 | +"max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494], |
| 240 | +"max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], |
| 241 | +"max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184], |
| 242 | +}, |
| 243 | +dtype="Float64", |
| 244 | +index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), |
| 245 | +) |
| 246 | + |
| 247 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
| 248 | + |
217 | 249 |
|
218 | 250 | def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
|
219 | 251 | scaler = preprocessing.MinMaxScaler()
|
@@ -231,7 +263,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
|
231 | 263 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
232 | 264 | )
|
233 | 265 |
|
234 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 266 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
235 | 267 |
|
236 | 268 |
|
237 | 269 | def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
|
@@ -255,7 +287,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin
|
255 | 287 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
256 | 288 | )
|
257 | 289 |
|
258 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 290 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
259 | 291 |
|
260 | 292 |
|
261 | 293 | def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
|
@@ -290,7 +322,7 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
|
290 | 322 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
291 | 323 | )
|
292 | 324 |
|
293 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 325 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
294 | 326 |
|
295 | 327 |
|
296 | 328 | def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
|
@@ -305,6 +337,22 @@ def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
|
305 | 337 | assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler)
|
306 | 338 | assert reloaded_transformer._bqml_model is not None
|
307 | 339 |
|
| 340 | +result = reloaded_transformer.fit_transform( |
| 341 | +new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] |
| 342 | +).to_pandas() |
| 343 | + |
| 344 | +expected = pd.DataFrame( |
| 345 | +{ |
| 346 | +"min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0], |
| 347 | +"min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], |
| 348 | +"min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667], |
| 349 | +}, |
| 350 | +dtype="Float64", |
| 351 | +index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), |
| 352 | +) |
| 353 | + |
| 354 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
| 355 | + |
308 | 356 |
|
309 | 357 | def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df):
|
310 | 358 | discretizer = preprocessing.KBinsDiscretizer(strategy="uniform")
|
@@ -322,7 +370,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins
|
322 | 370 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
323 | 371 | )
|
324 | 372 |
|
325 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 373 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
326 | 374 |
|
327 | 375 |
|
328 | 376 | def test_k_bins_discretizer_series_normalizes(
|
@@ -344,7 +392,7 @@ def test_k_bins_discretizer_series_normalizes(
|
344 | 392 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
345 | 393 | )
|
346 | 394 |
|
347 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 395 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
348 | 396 |
|
349 | 397 |
|
350 | 398 | def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df):
|
@@ -374,7 +422,7 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d
|
374 | 422 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
375 | 423 | )
|
376 | 424 |
|
377 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 425 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
378 | 426 |
|
379 | 427 |
|
380 | 428 | def test_k_bins_discretizer_normalizes_different_params(
|
@@ -406,7 +454,7 @@ def test_k_bins_discretizer_normalizes_different_params(
|
406 | 454 | index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
|
407 | 455 | )
|
408 | 456 |
|
409 |
| -pd.testing.assert_frame_equal(result, expected, rtol=1e-3) |
| 457 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
410 | 458 |
|
411 | 459 |
|
412 | 460 | def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
|
@@ -423,6 +471,22 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
|
423 | 471 | assert reloaded_transformer.strategy == transformer.strategy
|
424 | 472 | assert reloaded_transformer._bqml_model is not None
|
425 | 473 |
|
| 474 | +result = reloaded_transformer.fit_transform( |
| 475 | +new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] |
| 476 | +).to_pandas() |
| 477 | + |
| 478 | +expected = pd.DataFrame( |
| 479 | +{ |
| 480 | +"kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_4", "bin_2"], |
| 481 | +"kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_5"], |
| 482 | +"kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"], |
| 483 | +}, |
| 484 | +dtype="string[pyarrow]", |
| 485 | +index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), |
| 486 | +) |
| 487 | + |
| 488 | +pd.testing.assert_frame_equal(result, expected, rtol=0.1) |
| 489 | + |
426 | 490 |
|
427 | 491 | def test_one_hot_encoder_default_params(new_penguins_df):
|
428 | 492 | encoder = preprocessing.OneHotEncoder()
|
@@ -560,6 +624,29 @@ def test_one_hot_encoder_save_load(new_penguins_df, dataset_id):
|
560 | 624 | assert reloaded_transformer.max_categories == transformer.max_categories
|
561 | 625 | assert reloaded_transformer._bqml_model is not None
|
562 | 626 |
|
| 627 | +result = reloaded_transformer.fit_transform( |
| 628 | +new_penguins_df[["species", "sex"]] |
| 629 | +).to_pandas() |
| 630 | + |
| 631 | +expected = pd.DataFrame( |
| 632 | +{ |
| 633 | +"onehotencoded_species": [ |
| 634 | +[{"index": 1, "value": 1.0}], |
| 635 | +[{"index": 1, "value": 1.0}], |
| 636 | +[{"index": 2, "value": 1.0}], |
| 637 | +], |
| 638 | +"onehotencoded_sex": [ |
| 639 | +[{"index": 2, "value": 1.0}], |
| 640 | +[{"index": 1, "value": 1.0}], |
| 641 | +[{"index": 1, "value": 1.0}], |
| 642 | +], |
| 643 | +}, |
| 644 | +dtype=ONE_HOT_ENCODED_DTYPE, |
| 645 | +index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), |
| 646 | +) |
| 647 | + |
| 648 | +pd.testing.assert_frame_equal(result, expected) |
| 649 | + |
563 | 650 |
|
564 | 651 | def test_label_encoder_default_params(new_penguins_df):
|
565 | 652 | encoder = preprocessing.LabelEncoder()
|
@@ -677,5 +764,21 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id):
|
677 | 764 | assert reloaded_transformer.max_categories == transformer.max_categories
|
678 | 765 | assert reloaded_transformer._bqml_model is not None
|
679 | 766 |
|
| 767 | +result = reloaded_transformer.transform(new_penguins_df).to_pandas() |
| 768 | + |
| 769 | +expected = pd.DataFrame( |
| 770 | +{ |
| 771 | +"labelencoded_species": [ |
| 772 | +1, |
| 773 | +1, |
| 774 | +2, |
| 775 | +], |
| 776 | +}, |
| 777 | +dtype="Int64", |
| 778 | +index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), |
| 779 | +) |
| 780 | + |
| 781 | +pd.testing.assert_frame_equal(result, expected) |
| 782 | + |
680 | 783 |
|
681 | 784 | # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.
|
0 commit comments