2. Polars
[115]:
import polars as pl
[116]:
df = pl.read_csv("https://raw.githubusercontent.com/ms-robot-please/Python-for-Data-Science/master/iris.csv")
[117]:
df.head()
[117]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5 | 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
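As a side note, for larger files the same read could be done lazily, letting Polars optimize the query before touching the data. A minimal sketch, assuming a local copy of the file, since scan_csv takes a path:

# Lazy variant: nothing is read until collect() is called
lazy_df = pl.scan_csv("iris.csv")  # hypothetical local copy of the CSV above
lazy_df.head(5).collect()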
2.1. Simple operations
[118]:
# Multiply a column by 10
df.with_columns(pl.col("Sepal.Width")*10).head()
[118]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 35.0 | 1.4 | 0.2 | "setosa" |
2 | 4.9 | 30.0 | 1.4 | 0.2 | "setosa" |
3 | 4.7 | 32.0 | 1.3 | 0.2 | "setosa" |
4 | 4.6 | 31.0 | 1.5 | 0.2 | "setosa" |
5 | 5.0 | 36.0 | 1.4 | 0.2 | "setosa" |
[119]:
# Multiply two columns by 10
df.with_columns(pl.col(["Sepal.Width", "Petal.Length"])*10).head()
[119]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 35.0 | 14.0 | 0.2 | "setosa" |
2 | 4.9 | 30.0 | 14.0 | 0.2 | "setosa" |
3 | 4.7 | 32.0 | 13.0 | 0.2 | "setosa" |
4 | 4.6 | 31.0 | 15.0 | 0.2 | "setosa" |
5 | 5.0 | 36.0 | 14.0 | 0.2 | "setosa" |
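The same scaling can also target every float column at once by selecting on dtype. A sketch, reusing the suffix() helper that appears later in this notebook:

# Multiply all f64 columns by 10, keeping the originals under suffixed names
df.with_columns((pl.col(pl.Float64) * 10).suffix("_x10")).head()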
[120]:
# A more arbitrary function with apply
df.with_columns(pl.col("Sepal.Width").apply(lambda x: x if x < 3.4 else 3.4)).head()
[120]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 3.4 | 1.4 | 0.2 | "setosa" |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5 | 5.0 | 3.4 | 1.4 | 0.2 | "setosa" |
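apply runs a Python function per element, which is comparatively slow. A sketch of the same capping with native expressions, which stays inside the expression engine:

# Cap Sepal.Width at 3.4 with when/then/otherwise instead of apply
df.with_columns(
    pl.when(pl.col("Sepal.Width") < 3.4)
    .then(pl.col("Sepal.Width"))
    .otherwise(3.4)
    .alias("Sepal.Width")
).head()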
[121]:
# Same as before, but now with a new column name
df.with_columns(pl.col("Sepal.Width").alias("Sepal.Width.Capped").apply(lambda x: x if x < 3.4 else 3.4)).head()
[121]:
shape: (5, 7)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | Sepal.Width.Capped |
---|---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str | f64 |
1 | 5.1 | 3.5 | 1.4 | 0.2 | "setosa" | 3.4 |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" | 3.0 |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" | 3.2 |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" | 3.1 |
5 | 5.0 | 3.6 | 1.4 | 0.2 | "setosa" | 3.4 |
[122]:
# Same as before, but now we select only the two columns (select instead of with_columns)
df.select([pl.col("Sepal.Width"), pl.col("Sepal.Width").alias("Sepal.Width.Capped").apply(lambda x: x if x < 3.4 else 3.4)]).head()
[122]:
shape: (5, 2)
Sepal.Width | Sepal.Width.Capped |
---|---|
f64 | f64 |
3.5 | 3.4 |
3.0 | 3.0 |
3.2 | 3.2 |
3.1 | 3.1 |
3.6 | 3.4 |
[123]:
# Filtering
df.filter(pl.col("Petal.Length")<=1.5).head()
[123]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5 | 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
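Filter predicates can be combined with the usual boolean operators; for example:

# AND of two conditions; note the parentheses around each predicate
df.filter(
    (pl.col("Petal.Length") <= 1.5) & (pl.col("Sepal.Width") > 3.0)
).head()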
[124]:
# Now playing a little bit with multiple operations
df.with_columns(
    pl.col("Petal.Length").filter(pl.col("Sepal.Length")<=5.0).sum().alias("another_column")
).head()
[124]:
shape: (5, 7)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | another_column |
---|---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str | f64 |
1 | 5.1 | 3.5 | 1.4 | 0.2 | "setosa" | 54.5 |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" | 54.5 |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" | 54.5 |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" | 54.5 |
5 | 5.0 | 3.6 | 1.4 | 0.2 | "setosa" | 54.5 |
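The filtered sum above is a single scalar, so it is broadcast to every row. The same mechanism lets per-row and aggregate values mix in one expression, e.g.:

# Each Petal.Length as a fraction of the column total (scalar broadcast)
df.with_columns(
    (pl.col("Petal.Length") / pl.col("Petal.Length").sum()).alias("petal_frac")
).head()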
[125]:
# Sorting by Sepal.Length, with Sepal.Width as tiebreaker
# By default, sorting is ascending, so
# we need to specify descending=True if we don't want that
df.sort(["Sepal.Length", "Sepal.Width"], descending=True).head()
[125]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
132 | 7.9 | 3.8 | 6.4 | 2.0 | "virginica" |
118 | 7.7 | 3.8 | 6.7 | 2.2 | "virginica" |
136 | 7.7 | 3.0 | 6.1 | 2.3 | "virginica" |
123 | 7.7 | 2.8 | 6.7 | 2.0 | "virginica" |
119 | 7.7 | 2.6 | 6.9 | 2.3 | "virginica" |
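descending also accepts one flag per sort key, so the tiebreaker can run the other way; a sketch:

# Sepal.Length descending, ties broken by ascending Sepal.Width
df.sort(["Sepal.Length", "Sepal.Width"], descending=[True, False]).head()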
[126]:
# Remove duplicates over the combination of Sepal.Length and Sepal.Width,
# and then remove duplicates in column Sepal.Length alone
df.shape, df.unique(["Sepal.Length", "Sepal.Width"]).shape, df.unique("Sepal.Length").shape
[126]:
((150, 6), (117, 6), (35, 6))
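Which duplicate survives can be controlled with unique's keep parameter; a sketch, assuming this Polars version supports keep="last":

# Keep the last occurrence of each Sepal.Length instead of the first
df.unique("Sepal.Length", keep="last").shape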
2.2. Group By
[127]:
# agg expects expressions, in a similar fashion to select; with no
# aggregation function, each group's values are collected into a list
df.groupby("Species").agg(pl.col("Sepal.Width"))
[127]:
shape: (3, 2)
Species | Sepal.Width |
---|---|
str | list[f64] |
"setosa" | [3.5, 3.0, … 3.3] |
"versicolor" | [3.2, 3.2, … 2.8] |
"virginica" | [3.3, 2.7, … 3.0] |
[128]:
# Get the sum of the Sepal.Width per Species
df.groupby("Species").agg(pl.sum("Sepal.Width"))
# this will also work: df.groupby("Species").agg(pl.col("Sepal.Width").sum())
[128]:
shape: (3, 2)
Species | Sepal.Width |
---|---|
str | f64 |
"virginica" | 148.7 |
"setosa" | 171.4 |
"versicolor" | 138.5 |
[129]:
# Get the sum of the Sepal.Width squared per Species
df.groupby("Species").agg((pl.col("Sepal.Width")**2).sum())
[129]:
shape: (3, 2)
Species | Sepal.Width |
---|---|
str | f64 |
"setosa" | 594.6 |
"versicolor" | 388.47 |
"virginica" | 447.33 |
[130]:
# Now multiple operations, producing two derived columns
df.groupby("Species").agg([
    (pl.col("Sepal.Width").sum()**2).alias("Sepal.Width.Squared.of.sums"),
    (pl.col("Sepal.Width")**2).sum().alias("Sepal.Width.Sum.of.squares"),
])
[130]:
shape: (3, 3)
Species | Sepal.Width.Squared.of.sums | Sepal.Width.Sum.of.squares |
---|---|---|
str | f64 | f64 |
"virginica" | 22111.69 | 447.33 |
"setosa" | 29377.96 | 594.6 |
"versicolor" | 19182.25 | 388.47 |
[131]:
# We can keep transforming the result with select (or with_columns),
# since the result of agg is a regular DataFrame
df.groupby("Species").agg([
    (pl.col("Sepal.Width")**2).mean().alias("sq_mean"),
    (pl.col("Sepal.Width").mean()**2).alias("mean_sq"),
]).select([
    pl.col("Species"),
    (pl.col("sq_mean") - pl.col("mean_sq")).alias("Variance"),
])
[131]:
shape: (3, 2)
Species | Variance |
---|---|
str | f64 |
"virginica" | 0.101924 |
"versicolor" | 0.0965 |
"setosa" | 0.140816 |
[132]:
# Note that there is a simpler way to get the variance. The numbers differ
# slightly from the manual computation above because var() defaults to the
# sample variance (ddof=1), while our formula is the population variance
df.groupby("Species").agg(
    pl.col("Sepal.Width").var().alias("Variance")
)
[132]:
shape: (3, 2)
Species | Variance |
---|---|
str | f64 |
"versicolor" | 0.098469 |
"setosa" | 0.14369 |
"virginica" | 0.104004 |
[133]:
# Apply also works with groupby; however, it is a little more tricky,
# because it needs a function that takes a DataFrame and returns a DataFrame.
# Every time the function is called, it receives the DataFrame filtered to
# one group value, and that DataFrame includes the group column itself.
dfn = pl.DataFrame(
    {
        "a": ["a", "b", "a", "c", "b", "c", "b", "c", "b"],
        "b": range(9),
    }
)
counter = [0]  # a one-element list, so the closure below can mutate it
def func(x):
    print("Call number", counter[0])
    counter[0] += 1
    print("Input:", x)
    return x[:2]
# Uncomment to see the per-group DataFrames printed:
# dfn.sort(dfn.columns).groupby("a").apply(func)
[134]:
# We can also use apply inside an agg; there the lambda receives
# each group's column as a Series
df.groupby("Species").agg(
    pl.col("Sepal.Width").apply(lambda x: (x**2).sum())
)
[134]:
shape: (3, 2)
Species | Sepal.Width |
---|---|
str | f64 |
"setosa" | 594.6 |
"versicolor" | 388.47 |
"virginica" | 447.33 |
[135]:
# over() computes an aggregation per group while still returning
# every row of the dataset
# (similar to a window function, OVER (PARTITION BY ...), in SQL)
res_gb = df.groupby("Species").agg(pl.col("Petal.Width").mean().suffix("_mean")).head()
res_over = df.with_columns(pl.col("Petal.Width").mean().over("Species").suffix("_mean"))
print(res_gb)
print(res_over.shape)
print(res_over[0:2])
print(res_over.filter(pl.col("Species") == "virginica")[0:2])
shape: (3, 2)
┌────────────┬──────────────────┐
│ Species ┆ Petal.Width_mean │
│ --- ┆ --- │
│ str ┆ f64 │
╞════════════╪══════════════════╡
│ virginica ┆ 2.026 │
│ versicolor ┆ 1.326 │
│ setosa ┆ 0.246 │
└────────────┴──────────────────┘
(150, 7)
shape: (2, 7)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────┬──────────────────┐
│ ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species ┆ Petal.Width_mean │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ f64 │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════╪══════════════════╡
│ 1 ┆ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa ┆ 0.246 │
│ 2 ┆ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa ┆ 0.246 │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────┴──────────────────┘
shape: (2, 7)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬───────────┬──────────────────┐
│ ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species ┆ Petal.Width_mean │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ f64 │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═══════════╪══════════════════╡
│ 101 ┆ 6.3 ┆ 3.3 ┆ 6.0 ┆ 2.5 ┆ virginica ┆ 2.026 │
│ 102 ┆ 5.8 ┆ 2.7 ┆ 5.1 ┆ 1.9 ┆ virginica ┆ 2.026 │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴───────────┴──────────────────┘
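Because over() keeps every row, it is convenient for per-group normalization; a small sketch:

# Center Petal.Width within each Species using a window mean
df.with_columns(
    (pl.col("Petal.Width") - pl.col("Petal.Width").mean().over("Species"))
    .alias("Petal.Width.centered")
).head()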
2.3. Resampling
Imagine that we are given a time series dataset and need to resample it to weekly data.
[153]:
dft = pl.DataFrame(
    {
        "date": [
            "05/10/2022", "02/28/2022", "07/23/2022", "03/01/2022",
            "08/10/2022", "02/22/2022", "05/17/2022", "04/09/2022",
            "02/10/2022", "05/28/2022", "03/23/2022", "07/03/2022",
            "03/13/2022", "08/03/2022", "04/03/2022", "04/19/2022",
        ],
        "val": range(16),
        "description": [
            "a", "b", "c", "d",
            "e", "f", "g", "h",
            "i", "j", "k", "l",
            "m", "n", "o", "p",
        ],
    }
)
[154]:
# First let's parse the date column into a date type
print(dft.dtypes)
dft = dft.with_columns(pl.col("date").str.strptime(pl.Date, format="%m/%d/%Y"))
print(dft.dtypes)
dft.sort("date").head()
[Utf8, Int64, Utf8]
[Date, Int64, Utf8]
[154]:
shape: (5, 3)
date | val | description |
---|---|---|
date | i64 | str |
2022-02-10 | 8 | "i" |
2022-02-22 | 5 | "f" |
2022-02-28 | 1 | "b" |
2022-03-01 | 3 | "d" |
2022-03-13 | 12 | "m" |
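Once parsed, the dt namespace exposes calendar accessors. As a sketch, assuming this Polars version follows the ISO convention (which differs from Python's date.weekday() used below):

# Polars' dt.weekday() is 1-based (Monday=1 ... Sunday=7),
# while Python's date.weekday() is 0-based (Monday=0)
dft.with_columns(pl.col("date").dt.weekday().alias("weekday")).sort("date").head()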
[155]:
# Now we upsample it to daily data
# Note that we need to sort by date first
dft_daily = dft.sort("date").upsample("date", every="1d").interpolate().fill_null(strategy="forward")
# A naive weekly downsample takes every 7th row; applied to the raw,
# unsorted frame it does not give weekly dates (note the dates below),
# which the next cell fixes by slicing the daily upsampled frame instead
dft_weekly = dft[::7]
dft_weekly.head()
[155]:
shape: (3, 3)
date | val | description |
---|---|---|
date | i64 | str |
2022-05-10 | 0 | "a" |
2022-04-09 | 7 | "h" |
2022-04-03 | 14 | "o" |
[156]:
# Slice the daily upsampled frame instead, with an offset so that
# the weekly samples always fall on Mondays
offset = 7 - dft_daily['date'][0].weekday()
offset = offset if offset != 7 else 0
dft_weekly = dft_daily[offset:][::7]
dft_weekly.head()
[156]:
shape: (5, 3)
date | val | description |
---|---|---|
date | i64 | str |
2022-02-14 | 7 | "i" |
2022-02-21 | 6 | "i" |
2022-02-28 | 1 | "b" |
2022-03-07 | 7 | "d" |
2022-03-14 | 12 | "m" |
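Finally, Polars also has a dedicated temporal grouper. A sketch of the same weekly resample with groupby_dynamic, assuming it is available under that name in this version:

# Weekly buckets with an explicit aggregation instead of row slicing
dft.sort("date").groupby_dynamic("date", every="1w").agg(
    pl.col("val").mean().alias("val_mean")
)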