2. Polars

[115]:
import polars as pl
[116]:
df = pl.read_csv("https://raw.githubusercontent.com/ms-robot-please/Python-for-Data-Science/master/iris.csv")
[117]:
df.head()
[117]:
shape: (5, 6)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species     │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---         │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str         │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════════╡
│ 1   ┆ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
│ 2   ┆ 4.9          ┆ 3.0         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
│ 3   ┆ 4.7          ┆ 3.2         ┆ 1.3          ┆ 0.2         ┆ "setosa"    │
│ 4   ┆ 4.6          ┆ 3.1         ┆ 1.5          ┆ 0.2         ┆ "setosa"    │
│ 5   ┆ 5.0          ┆ 3.6         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────────┘
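
A quick aside: before manipulating the data it can help to glance at the frame's structure. A minimal sketch using standard DataFrame introspection (output omitted):

# Peek at the overall structure of the DataFrame
print(df.shape)       # (number of rows, number of columns)
print(df.schema)      # mapping of column name -> dtype
print(df.describe())  # per-column summary statistics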

2.1. Simple operations

[118]:
# Multiply a column by 10
df.with_columns(pl.col("Sepal.Width")*10).head()
[118]:
shape: (5, 6)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species     │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---         │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str         │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════════╡
│ 1   ┆ 5.1          ┆ 35.0        ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
│ 2   ┆ 4.9          ┆ 30.0        ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
│ 3   ┆ 4.7          ┆ 32.0        ┆ 1.3          ┆ 0.2         ┆ "setosa"    │
│ 4   ┆ 4.6          ┆ 31.0        ┆ 1.5          ┆ 0.2         ┆ "setosa"    │
│ 5   ┆ 5.0          ┆ 36.0        ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────────┘
[119]:
# Multiply two columns by 10
df.with_columns(pl.col(["Sepal.Width", "Petal.Length"])*10).head()
[119]:
shape: (5, 6)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species     │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---         │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str         │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════════╡
│ 1   ┆ 5.1          ┆ 35.0        ┆ 14.0         ┆ 0.2         ┆ "setosa"    │
│ 2   ┆ 4.9          ┆ 30.0        ┆ 14.0         ┆ 0.2         ┆ "setosa"    │
│ 3   ┆ 4.7          ┆ 32.0        ┆ 13.0         ┆ 0.2         ┆ "setosa"    │
│ 4   ┆ 4.6          ┆ 31.0        ┆ 15.0         ┆ 0.2         ┆ "setosa"    │
│ 5   ┆ 5.0          ┆ 36.0        ┆ 14.0         ┆ 0.2         ┆ "setosa"    │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────────┘
[120]:
# A more arbitrary function with apply
df.with_columns(pl.col("Sepal.Width").apply(lambda x: x if x < 3.4 else 3.4)).head()
[120]:
shape: (5, 6)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species     │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---         │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str         │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════════╡
│ 1   ┆ 5.1          ┆ 3.4         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
│ 2   ┆ 4.9          ┆ 3.0         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
│ 3   ┆ 4.7          ┆ 3.2         ┆ 1.3          ┆ 0.2         ┆ "setosa"    │
│ 4   ┆ 4.6          ┆ 3.1         ┆ 1.5          ┆ 0.2         ┆ "setosa"    │
│ 5   ┆ 5.0          ┆ 3.4         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────────┘
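
As an aside, the same capping can also be written with Polars' native when/then/otherwise expression, which stays inside the expression engine instead of calling a Python lambda per value. A sketch:

# Cap Sepal.Width at 3.4 without apply
df.with_columns(
    pl.when(pl.col("Sepal.Width") < 3.4)
      .then(pl.col("Sepal.Width"))
      .otherwise(3.4)
      .alias("Sepal.Width")
).head()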
[121]:
# Same as before, but now with a new column name
df.with_columns(pl.col("Sepal.Width").alias("Sepal.Width.Capped").apply(lambda x: x if x < 3.4 else 3.4)).head()
[121]:
shape: (5, 7)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────────┬────────────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species     ┆ Sepal.Width.Capped │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---         ┆ ---                │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str         ┆ f64                │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════════╪════════════════════╡
│ 1   ┆ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ "setosa"    ┆ 3.4                │
│ 2   ┆ 4.9          ┆ 3.0         ┆ 1.4          ┆ 0.2         ┆ "setosa"    ┆ 3.0                │
│ 3   ┆ 4.7          ┆ 3.2         ┆ 1.3          ┆ 0.2         ┆ "setosa"    ┆ 3.2                │
│ 4   ┆ 4.6          ┆ 3.1         ┆ 1.5          ┆ 0.2         ┆ "setosa"    ┆ 3.1                │
│ 5   ┆ 5.0          ┆ 3.6         ┆ 1.4          ┆ 0.2         ┆ "setosa"    ┆ 3.4                │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────────┴────────────────────┘
[122]:
# Same as before, but now we select only the two columns (select instead of with_columns)
df.select([pl.col("Sepal.Width"), pl.col("Sepal.Width").alias("Sepal.Width.Capped").apply(lambda x: x if x < 3.4 else 3.4)]).head()
[122]:
shape: (5, 2)
┌─────────────┬────────────────────┐
│ Sepal.Width ┆ Sepal.Width.Capped │
│ ---         ┆ ---                │
│ f64         ┆ f64                │
╞═════════════╪════════════════════╡
│ 3.5         ┆ 3.4                │
│ 3.0         ┆ 3.0                │
│ 3.2         ┆ 3.2                │
│ 3.1         ┆ 3.1                │
│ 3.6         ┆ 3.4                │
└─────────────┴────────────────────┘
[123]:
# Filtering
df.filter(pl.col("Petal.Length")<=1.5).head()
[123]:
shape: (5, 6)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species     │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---         │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str         │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════════╡
│ 1   ┆ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
│ 2   ┆ 4.9          ┆ 3.0         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
│ 3   ┆ 4.7          ┆ 3.2         ┆ 1.3          ┆ 0.2         ┆ "setosa"    │
│ 4   ┆ 4.6          ┆ 3.1         ┆ 1.5          ┆ 0.2         ┆ "setosa"    │
│ 5   ┆ 5.0          ┆ 3.6         ┆ 1.4          ┆ 0.2         ┆ "setosa"    │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────────┘
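
Filter conditions can be combined with & (and) and | (or), each wrapped in its own parentheses. For example:

# Keep only short-petaled setosa rows
df.filter(
    (pl.col("Petal.Length") <= 1.5) & (pl.col("Species") == "setosa")
).head()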
[124]:
# Now let's combine a few operations: a filtered sum broadcast to every row
df.with_columns(
    pl.col("Petal.Length").filter(pl.col("Sepal.Length")<=5.0).sum().alias("another_column")
).head()
[124]:
shape: (5, 7)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────────┬────────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species     ┆ another_column │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---         ┆ ---            │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str         ┆ f64            │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════════╪════════════════╡
│ 1   ┆ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ "setosa"    ┆ 54.5           │
│ 2   ┆ 4.9          ┆ 3.0         ┆ 1.4          ┆ 0.2         ┆ "setosa"    ┆ 54.5           │
│ 3   ┆ 4.7          ┆ 3.2         ┆ 1.3          ┆ 0.2         ┆ "setosa"    ┆ 54.5           │
│ 4   ┆ 4.6          ┆ 3.1         ┆ 1.5          ┆ 0.2         ┆ "setosa"    ┆ 54.5           │
│ 5   ┆ 5.0          ┆ 3.6         ┆ 1.4          ┆ 0.2         ┆ "setosa"    ┆ 54.5           │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────────┴────────────────┘
[125]:
# Sorting by Sepal.Length and then Sepal.Width as a tiebreaker
# By default, sorting is ascending, so
# we need to specify descending=True if we don't want that
df.sort(["Sepal.Length", "Sepal.Width"], descending=True).head()
[125]:
shape: (5, 6)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species     │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---         │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str         │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════════╡
│ 132 ┆ 7.9          ┆ 3.8         ┆ 6.4          ┆ 2.0         ┆ "virginica" │
│ 118 ┆ 7.7          ┆ 3.8         ┆ 6.7          ┆ 2.2         ┆ "virginica" │
│ 136 ┆ 7.7          ┆ 3.0         ┆ 6.1          ┆ 2.3         ┆ "virginica" │
│ 123 ┆ 7.7          ┆ 2.8         ┆ 6.7          ┆ 2.0         ┆ "virginica" │
│ 119 ┆ 7.7          ┆ 2.6         ┆ 6.9          ┆ 2.3         ┆ "virginica" │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────────┘
[126]:
# Remove duplicates over the combination of Sepal.Length and Sepal.Width,
# and then remove duplicates in the Sepal.Length column alone
df.shape, df.unique(["Sepal.Length", "Sepal.Width"]).shape, df.unique("Sepal.Length").shape
[126]:
((150, 6), (117, 6), (35, 6))
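
unique() also accepts subset and keep arguments (exact defaults depend on the Polars version) to control which columns are compared and which duplicate row is retained. A sketch:

# Deduplicate on Sepal.Length only, keeping the first occurrence of each value
df.unique(subset="Sepal.Length", keep="first").shape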

2.2. Group By

[127]:
# agg expects an expression (or several), in a similar fashion to select
df.groupby("Species").agg(pl.col("Sepal.Width"))
[127]:
shape: (3, 2)
┌──────────────┬───────────────────┐
│ Species      ┆ Sepal.Width       │
│ ---          ┆ ---               │
│ str          ┆ list[f64]         │
╞══════════════╪═══════════════════╡
│ "setosa"     ┆ [3.5, 3.0, … 3.3] │
│ "versicolor" ┆ [3.2, 3.2, … 2.8] │
│ "virginica"  ┆ [3.3, 2.7, … 3.0] │
└──────────────┴───────────────────┘
[128]:
# Get the sum of the Sepal.Width per Species
df.groupby("Species").agg(pl.sum("Sepal.Width"))
# this will also work: df.groupby("Species").agg(pl.col("Sepal.Width").sum())
[128]:
shape: (3, 2)
┌──────────────┬─────────────┐
│ Species      ┆ Sepal.Width │
│ ---          ┆ ---         │
│ str          ┆ f64         │
╞══════════════╪═════════════╡
│ "virginica"  ┆ 148.7       │
│ "setosa"     ┆ 171.4       │
│ "versicolor" ┆ 138.5       │
└──────────────┴─────────────┘
[129]:
# Get the sum of the Sepal.Width squared per Species
df.groupby("Species").agg((pl.col("Sepal.Width")**2).sum())
[129]:
shape: (3, 2)
┌──────────────┬─────────────┐
│ Species      ┆ Sepal.Width │
│ ---          ┆ ---         │
│ str          ┆ f64         │
╞══════════════╪═════════════╡
│ "setosa"     ┆ 594.6       │
│ "versicolor" ┆ 388.47      │
│ "virginica"  ┆ 447.33      │
└──────────────┴─────────────┘
[130]:
# Now two aggregations at once, each producing its own column
df.groupby("Species").agg((
    (pl.col("Sepal.Width").sum()**2).alias("Sepal.Width.Squared.of.sums"),
    (pl.col("Sepal.Width")**2).sum().alias("Sepal.Width.Sum.of.squares"),
))
[130]:
shape: (3, 3)
┌──────────────┬─────────────────────────────┬────────────────────────────┐
│ Species      ┆ Sepal.Width.Squared.of.sums ┆ Sepal.Width.Sum.of.squares │
│ ---          ┆ ---                         ┆ ---                        │
│ str          ┆ f64                         ┆ f64                        │
╞══════════════╪═════════════════════════════╪════════════════════════════╡
│ "virginica"  ┆ 22111.69                    ┆ 447.33                     │
│ "setosa"     ┆ 29377.96                    ┆ 594.6                      │
│ "versicolor" ┆ 19182.25                    ┆ 388.47                     │
└──────────────┴─────────────────────────────┴────────────────────────────┘
[131]:
# We can keep working with the result, using select (or with_columns) as before,
# since the result of agg is just a DataFrame
df.groupby("Species").agg((
    (pl.col("Sepal.Width")**2).mean().alias("sq_mean"),
    (pl.col("Sepal.Width").mean()**2).alias("mean_sq"),
)).select((
    pl.col("Species"),
    (pl.col("sq_mean") - pl.col("mean_sq")).alias("Variance")
))
[131]:
shape: (3, 2)
┌──────────────┬──────────┐
│ Species      ┆ Variance │
│ ---          ┆ ---      │
│ str          ┆ f64      │
╞══════════════╪══════════╡
│ "virginica"  ┆ 0.101924 │
│ "versicolor" ┆ 0.0965   │
│ "setosa"     ┆ 0.140816 │
└──────────────┴──────────┘
[132]:
# Note that there is a simpler way to get the variance.
# The values differ slightly from above because var() defaults to ddof=1
# (sample variance), whereas the manual computation gives the population variance
df.groupby("Species").agg(
    pl.col("Sepal.Width").var().alias("Variance")
)
[132]:
shape: (3, 2)
┌──────────────┬──────────┐
│ Species      ┆ Variance │
│ ---          ┆ ---      │
│ str          ┆ f64      │
╞══════════════╪══════════╡
│ "versicolor" ┆ 0.098469 │
│ "setosa"     ┆ 0.14369  │
│ "virginica"  ┆ 0.104004 │
└──────────────┴──────────┘
[133]:
# apply also works with groupby; however, it is a little more tricky,
# because it needs a function that takes a DataFrame and returns a DataFrame.
# Every time the function is called, it receives the sub-DataFrame for one
# group value, and that sub-DataFrame includes the group column itself.
dfn = pl.DataFrame(
    {
        "a": ["a", "b", "a", "c", "b", "c", "b", "c", "b"],
        "b": range(9),
    }
)
counter = [0]
def func(x):
    print("Call number", counter[0])
    counter[0] += 1
    print("Input:", x)
    return x[:2]
# Uncomment to see func being called once per group:
# dfn.sort(dfn.columns).groupby("a").apply(func)
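
For reference, a similar "first two rows per group" result can usually be obtained without a Python-level apply, e.g. with head inside agg (this yields the values as lists rather than a concatenated frame):

# Keep the first two b values per group of a
dfn.sort(dfn.columns).groupby("a", maintain_order=True).agg(pl.col("b").head(2))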
[134]:
# We can also use apply inside an agg
df.groupby("Species").agg(
    pl.col("Sepal.Width").apply(lambda x: (x**2).sum())
)
[134]:
shape: (3, 2)
┌──────────────┬─────────────┐
│ Species      ┆ Sepal.Width │
│ ---          ┆ ---         │
│ str          ┆ f64         │
╞══════════════╪═════════════╡
│ "setosa"     ┆ 594.6       │
│ "versicolor" ┆ 388.47      │
│ "virginica"  ┆ 447.33      │
└──────────────┴─────────────┘
[135]:
# over() lets us compute an aggregation per group while still returning
# the whole dataset, with the aggregated value broadcast to every row
# (similar to a window function, OVER (PARTITION BY ...), in SQL)
res_gb = df.groupby("Species").agg(pl.col("Petal.Width").mean().suffix("_mean")).head()
res_over = df.with_columns(pl.col("Petal.Width").mean().over("Species").suffix("_mean"))
print(res_gb)
print(res_over.shape)
print(res_over[0:2])
print(res_over.filter(pl.col("Species") == "virginica")[0:2])
shape: (3, 2)
┌────────────┬──────────────────┐
│ Species    ┆ Petal.Width_mean │
│ ---        ┆ ---              │
│ str        ┆ f64              │
╞════════════╪══════════════════╡
│ virginica  ┆ 2.026            │
│ versicolor ┆ 1.326            │
│ setosa     ┆ 0.246            │
└────────────┴──────────────────┘
(150, 7)
shape: (2, 7)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────┬──────────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species ┆ Petal.Width_mean │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---     ┆ ---              │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str     ┆ f64              │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════╪══════════════════╡
│ 1   ┆ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa  ┆ 0.246            │
│ 2   ┆ 4.9          ┆ 3.0         ┆ 1.4          ┆ 0.2         ┆ setosa  ┆ 0.246            │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────┴──────────────────┘
shape: (2, 7)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬───────────┬──────────────────┐
│     ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   ┆ Petal.Width_mean │
│ --- ┆ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       ┆ ---              │
│ i64 ┆ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       ┆ f64              │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═══════════╪══════════════════╡
│ 101 ┆ 6.3          ┆ 3.3         ┆ 6.0          ┆ 2.5         ┆ virginica ┆ 2.026            │
│ 102 ┆ 5.8          ┆ 2.7         ┆ 5.1          ┆ 1.9         ┆ virginica ┆ 2.026            │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴───────────┴──────────────────┘
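
Because over() keeps every row, it is handy for comparing rows against their group statistics. For instance, a sketch (the column name is just illustrative) centering each Petal.Width on its species mean:

# Deviation of each Petal.Width from the mean of its species
df.with_columns(
    (pl.col("Petal.Width") - pl.col("Petal.Width").mean().over("Species"))
    .alias("Petal.Width_centered")
).head()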

2.3. Resampling

Imagine that we are given a time series dataset and need to resample it to weekly data.

[153]:
dft = pl.DataFrame(
    {
        "date": [
            "05/10/2022", "02/28/2022", "07/23/2022", "03/01/2022",
            "08/10/2022", "02/22/2022", "05/17/2022", "04/09/2022",
            "02/10/2022", "05/28/2022", "03/23/2022", "07/03/2022",
            "03/13/2022", "08/03/2022", "04/03/2022", "04/19/2022",
        ],
        "val": range(16),
        "description": [
            "a", "b", "c", "d",
            "e", "f", "g", "h",
            "i", "j", "k", "l",
            "m", "n", "o", "p",
        ]
    }
)
[154]:
# First let's parse the date column into a date type
print(dft.dtypes)
dft = dft.with_columns(pl.col("date").str.strptime(pl.Date, format="%m/%d/%Y"))
print(dft.dtypes)
dft.sort("date").head()
[Utf8, Int64, Utf8]
[Date, Int64, Utf8]
[154]:
shape: (5, 3)
┌────────────┬─────┬─────────────┐
│ date       ┆ val ┆ description │
│ ---        ┆ --- ┆ ---         │
│ date       ┆ i64 ┆ str         │
╞════════════╪═════╪═════════════╡
│ 2022-02-10 ┆ 8   ┆ "i"         │
│ 2022-02-22 ┆ 5   ┆ "f"         │
│ 2022-02-28 ┆ 1   ┆ "b"         │
│ 2022-03-01 ┆ 3   ┆ "d"         │
│ 2022-03-13 ┆ 12  ┆ "m"         │
└────────────┴─────┴─────────────┘
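
Once the column is a proper pl.Date, the dt namespace exposes the date parts directly. A small sketch (assuming this Polars version's dt accessors):

# Extract date parts from the parsed column
dft.with_columns([
    pl.col("date").dt.year().alias("year"),
    pl.col("date").dt.weekday().alias("weekday"),
]).sort("date").head()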
[155]:
# Now we upsample it to daily data
# Note that we need to sort by date first
dft_daily = dft.sort("date").upsample("date", every="1d").interpolate().fill_null(strategy="forward")

# And then we downsample it to weekly data by taking every 7th row
# (note: this slices the original, unsorted frame; the next cell does it
# properly on the upsampled dft_daily)
dft_weekly = dft[::7]
dft_weekly.head()
[155]:
shape: (3, 3)
┌────────────┬─────┬─────────────┐
│ date       ┆ val ┆ description │
│ ---        ┆ --- ┆ ---         │
│ date       ┆ i64 ┆ str         │
╞════════════╪═════╪═════════════╡
│ 2022-05-10 ┆ 0   ┆ "a"         │
│ 2022-04-09 ┆ 7   ┆ "h"         │
│ 2022-04-03 ┆ 14  ┆ "o"         │
└────────────┴─────┴─────────────┘
[156]:
# Additionally, we could calculate an offset so that the
# weekly data always falls on Mondays
offset = 7 - dft_daily['date'][0].weekday()
offset = offset if offset != 7 else 0
dft_weekly = dft_daily[offset:][::7]
dft_weekly.head()
[156]:
shape: (5, 3)
┌────────────┬─────┬─────────────┐
│ date       ┆ val ┆ description │
│ ---        ┆ --- ┆ ---         │
│ date       ┆ i64 ┆ str         │
╞════════════╪═════╪═════════════╡
│ 2022-02-14 ┆ 7   ┆ "i"         │
│ 2022-02-21 ┆ 6   ┆ "i"         │
│ 2022-02-28 ┆ 1   ┆ "b"         │
│ 2022-03-07 ┆ 7   ┆ "d"         │
│ 2022-03-14 ┆ 12  ┆ "m"         │
└────────────┴─────┴─────────────┘
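
As a sketch of an alternative: Polars also has groupby_dynamic (spelled group_by_dynamic in newer versions), which buckets rows into fixed time windows directly and avoids the manual upsample-and-slice dance when all we need are weekly aggregates. Depending on the version, the index column may also need to be flagged as sorted:

# Weekly sums of val, computed directly on the sorted data
dft.sort("date").groupby_dynamic("date", every="1w").agg(
    pl.col("val").sum().alias("val_weekly_sum")
)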