import polars as pl
import polars.selectors as cs
import numpy as np
import uuid
import polars_uuid4
pl.__version__
'0.20.7'
# Build a 10-million-row demo frame: one column of random floats plus a
# constant string column (the scalar "value" is repeated on every row), then
# add a row index as the first column.
df = pl.DataFrame(
    {
        "Random numbers": np.random.rand(10_000_000),
        "A string column": "value",
    }
).with_row_index()

# Show the last five rows.
df.tail()
<style>
.dataframe > thead > tr,
.dataframe > tbody > tr {
text-align: right;
white-space: pre-wrap;
}
</style>
shape: (5, 3)
index | Random numbers | A string column |
---|---|---|
u32 | f64 | str |
9999995 | 0.342875 | "value" |
9999996 | 0.283626 | "value" |
9999997 | 0.91639 | "value" |
9999998 | 0.299616 | "value" |
9999999 | 0.460211 | "value" |
- `with_uuid4()` accepts an optional argument that sets the name of the new column; it defaults to `uuid`.
# Append a UUID4 string column (named "uuid" by default) through the `uuid`
# DataFrame namespace -- presumably registered by the `polars_uuid4` import.
df.uuid.with_uuid4()
<style>
.dataframe > thead > tr,
.dataframe > tbody > tr {
text-align: right;
white-space: pre-wrap;
}
</style>
shape: (10_000_000, 4)
index | Random numbers | A string column | uuid |
---|---|---|---|
u32 | f64 | str | str |
0 | 0.431903 | "value" | "{57cfa3fd-01a5… |
1 | 0.198707 | "value" | "{3e418a42-db42… |
2 | 0.626431 | "value" | "{1e16aeb2-0675… |
3 | 0.790102 | "value" | "{e1129c0a-38e1… |
4 | 0.907382 | "value" | "{8ad58341-ab23… |
5 | 0.995303 | "value" | "{83ed9d53-30a5… |
6 | 0.998931 | "value" | "{2ce35a0f-9981… |
7 | 0.836289 | "value" | "{655d0891-0f1b… |
8 | 0.872352 | "value" | "{77fec4e7-1a23… |
9 | 0.529137 | "value" | "{912c7ff7-0a12… |
10 | 0.322931 | "value" | "{d6402d0d-b5ab… |
11 | 0.456256 | "value" | "{26c89cc9-d740… |
… | … | … | … |
9999988 | 0.006378 | "value" | "{ddc657e6-2fa7… |
9999989 | 0.50514 | "value" | "{3a7f87a4-23de… |
9999990 | 0.708277 | "value" | "{b51b0665-32a0… |
9999991 | 0.743679 | "value" | "{5fe2070b-9d4c… |
9999992 | 0.937289 | "value" | "{11b6f029-6d44… |
9999993 | 0.763785 | "value" | "{44b87135-d0a7… |
9999994 | 0.913705 | "value" | "{9127c91c-2a4f… |
9999995 | 0.342875 | "value" | "{4dcc6d5e-97da… |
9999996 | 0.283626 | "value" | "{3b34e5ff-1047… |
9999997 | 0.91639 | "value" | "{d32b1a17-50ba… |
9999998 | 0.299616 | "value" | "{71ad3545-fe92… |
9999999 | 0.460211 | "value" | "{5ca39c0a-9993… |
# Same demo data, this time starting from a LazyFrame: the `uuid` accessor is
# used on the lazy frame as well, and `collect()` materialises the result.
df = (
    pl.LazyFrame(
        {
            "Random numbers": np.random.rand(10_000_000),
            "A string column": "value",
        }
    )
    .with_row_index()
    .uuid.with_uuid4()
    .collect()
)

# Show the last five rows.
df.tail()
<style>
.dataframe > thead > tr,
.dataframe > tbody > tr {
text-align: right;
white-space: pre-wrap;
}
</style>
shape: (5, 4)
index | Random numbers | A string column | uuid |
---|---|---|---|
u32 | f64 | str | str |
9999995 | 0.185959 | "value" | "{c4baf1ce-98c5… |
9999996 | 0.005801 | "value" | "{172ddf3c-ea9b… |
9999997 | 0.606094 | "value" | "{3dc75c0d-19fd… |
9999998 | 0.268984 | "value" | "{f9a4f709-a2e9… |
9999999 | 0.22677 | "value" | "{75f6c83d-a693… |
- Gets the job done: creates a UUID4 for each row.
- Uses Python's `uuid` module.
- Takes a long time (by polars standards).
- 20.4 s ± 160 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
uuids = ["{"+str(uuid.uuid4())+"}" for i in range(len(df))]
uuid_series = pl.Series(name="python_UUID", values=uuids)
df.with_columns(
uuid_series
)
20.4 s ± 160 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
- Gets the job done: creates a UUID4 for each row.
- Uses the Rust `uuid` crate.
- Much simpler code that is easier to understand.
- About 40x faster than generating UUID4s with Python's `uuid` module when the last column of the DataFrame is already a string column.
- 512 ms ± 6.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
# Benchmark the polars_uuid4 accessor: add a UUID4 string column to `df`.
df.uuid.with_uuid4()
512 ms ± 6.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
- When the last column is not a string (as in the DataFrame below), it is slightly slower: 644 ms ± 6.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# Rebuild the demo frame without the string column, leaving only the row index
# and the random floats.
df = pl.DataFrame(
    {"Random numbers": np.random.rand(10_000_000)}
).with_row_index()

# Show the last five rows.
df.tail()
<style>
.dataframe > thead > tr,
.dataframe > tbody > tr {
text-align: right;
white-space: pre-wrap;
}
</style>
shape: (5, 2)
index | Random numbers |
---|---|
u32 | f64 |
9999995 | 0.313362 |
9999996 | 0.679717 |
9999997 | 0.076164 |
9999998 | 0.853126 |
9999999 | 0.892428 |
%%timeit
# Benchmark the polars_uuid4 accessor again; `df` is expected to be the
# two-column frame (index + floats) built in the preceding cell.
df.uuid.with_uuid4()
644 ms ± 6.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)