2. Polars
[115]:
import polars as pl
[116]:
df = pl.read_csv("https://raw.githubusercontent.com/ms-robot-please/Python-for-Data-Science/master/iris.csv")
[117]:
df.head()
[117]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5 | 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
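As a side note, for larger files the same read could be done lazily, letting Polars optimize the query before touching the data. A minimal sketch, assuming a local copy of the file, since scan_csv takes a path:

# Lazy variant: nothing is read until collect() is called
lazy_df = pl.scan_csv("iris.csv")  # hypothetical local copy of the CSV above
lazy_df.head(5).collect()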
2.1. Simple operations
[118]:
# Multiply a column by 10
df.with_columns(pl.col("Sepal.Width")*10).head()
[118]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 35.0 | 1.4 | 0.2 | "setosa" |
2 | 4.9 | 30.0 | 1.4 | 0.2 | "setosa" |
3 | 4.7 | 32.0 | 1.3 | 0.2 | "setosa" |
4 | 4.6 | 31.0 | 1.5 | 0.2 | "setosa" |
5 | 5.0 | 36.0 | 1.4 | 0.2 | "setosa" |
[119]:
# Multiply two columns by 10
df.with_columns(pl.col(["Sepal.Width", "Petal.Length"])*10).head()
[119]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 35.0 | 14.0 | 0.2 | "setosa" |
2 | 4.9 | 30.0 | 14.0 | 0.2 | "setosa" |
3 | 4.7 | 32.0 | 13.0 | 0.2 | "setosa" |
4 | 4.6 | 31.0 | 15.0 | 0.2 | "setosa" |
5 | 5.0 | 36.0 | 14.0 | 0.2 | "setosa" |
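The same scaling can also target every float column at once by selecting on dtype. A sketch, reusing the suffix() helper that appears later in this notebook:

# Multiply all f64 columns by 10, keeping the originals under suffixed names
df.with_columns((pl.col(pl.Float64) * 10).suffix("_x10")).head()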
[120]:
# A more arbitrary function with apply
df.with_columns(pl.col("Sepal.Width").apply(lambda x: x if x < 3.4 else 3.4)).head()
[120]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 3.4 | 1.4 | 0.2 | "setosa" |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5 | 5.0 | 3.4 | 1.4 | 0.2 | "setosa" |
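apply runs a Python function per element, which is comparatively slow. A sketch of the same capping with native expressions, which stays inside the expression engine:

# Cap Sepal.Width at 3.4 with when/then/otherwise instead of apply
df.with_columns(
    pl.when(pl.col("Sepal.Width") < 3.4)
    .then(pl.col("Sepal.Width"))
    .otherwise(3.4)
    .alias("Sepal.Width")
).head()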
[121]:
# Same as before, but now with a new column name
df.with_columns(pl.col("Sepal.Width").alias("Sepal.Width.Capped").apply(lambda x: x if x < 3.4 else 3.4)).head()
[121]:
shape: (5, 7)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | Sepal.Width.Capped |
---|---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str | f64 |
1 | 5.1 | 3.5 | 1.4 | 0.2 | "setosa" | 3.4 |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" | 3.0 |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" | 3.2 |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" | 3.1 |
5 | 5.0 | 3.6 | 1.4 | 0.2 | "setosa" | 3.4 |
[122]:
# Same as before, but now we select only the two columns (select instead of with_columns)
df.select([pl.col("Sepal.Width"), pl.col("Sepal.Width").alias("Sepal.Width.Capped").apply(lambda x: x if x < 3.4 else 3.4)]).head()
[122]:
shape: (5, 2)
Sepal.Width | Sepal.Width.Capped |
---|---|
f64 | f64 |
3.5 | 3.4 |
3.0 | 3.0 |
3.2 | 3.2 |
3.1 | 3.1 |
3.6 | 3.4 |
[123]:
# Filtering
df.filter(pl.col("Petal.Length")<=1.5).head()
[123]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
1 | 5.1 | 3.5 | 1.4 | 0.2 | "setosa" |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" |
5 | 5.0 | 3.6 | 1.4 | 0.2 | "setosa" |
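Filter predicates can be combined with the usual boolean operators; for example:

# AND of two conditions; note the parentheses around each predicate
df.filter(
    (pl.col("Petal.Length") <= 1.5) & (pl.col("Sepal.Width") > 3.0)
).head()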
[124]:
# Now playing a little bit with multiple operations
df.with_columns(
    pl.col("Petal.Length").filter(pl.col("Sepal.Length")<=5.0).sum().alias("another_column")
).head()
[124]:
shape: (5, 7)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | another_column |
---|---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str | f64 |
1 | 5.1 | 3.5 | 1.4 | 0.2 | "setosa" | 54.5 |
2 | 4.9 | 3.0 | 1.4 | 0.2 | "setosa" | 54.5 |
3 | 4.7 | 3.2 | 1.3 | 0.2 | "setosa" | 54.5 |
4 | 4.6 | 3.1 | 1.5 | 0.2 | "setosa" | 54.5 |
5 | 5.0 | 3.6 | 1.4 | 0.2 | "setosa" | 54.5 |
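The filtered sum above is a single scalar, so it is broadcast to every row. The same mechanism lets per-row and aggregate values mix in one expression, e.g.:

# Each Petal.Length as a fraction of the column total (scalar broadcast)
df.with_columns(
    (pl.col("Petal.Length") / pl.col("Petal.Length").sum()).alias("petal_frac")
).head()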
[125]:
# Sorting by Sepal.Length, with Sepal.Width as tiebreaker
# By default, sorting is ascending, so
# we need to specify descending=True if we don't want that
df.sort(["Sepal.Length", "Sepal.Width"], descending=True).head()
[125]:
shape: (5, 6)
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|---|
i64 | f64 | f64 | f64 | f64 | str |
132 | 7.9 | 3.8 | 6.4 | 2.0 | "virginica" |
118 | 7.7 | 3.8 | 6.7 | 2.2 | "virginica" |
136 | 7.7 | 3.0 | 6.1 | 2.3 | "virginica" |
123 | 7.7 | 2.8 | 6.7 | 2.0 | "virginica" |
119 | 7.7 | 2.6 | 6.9 | 2.3 | "virginica" |
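descending also accepts one flag per sort key, so the tiebreaker can run the other way; a sketch:

# Sepal.Length descending, ties broken by ascending Sepal.Width
df.sort(["Sepal.Length", "Sepal.Width"], descending=[True, False]).head()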
[126]:
# Remove duplicates over the combination of Sepal.Length and Sepal.Width,
# and then remove duplicates in column Sepal.Length alone
df.shape, df.unique(["Sepal.Length", "Sepal.Width"]).shape, df.unique("Sepal.Length").shape
[126]:
((150, 6), (117, 6), (35, 6))
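Which duplicate survives can be controlled with unique's keep parameter; a sketch, assuming this Polars version supports keep="last":

# Keep the last occurrence of each Sepal.Length instead of the first
df.unique("Sepal.Length", keep="last").shape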
2.2. Group By
[127]:
# agg expects expressions, in a similar fashion to select; with no
# aggregation function, each group's values are collected into a list
df.groupby("Species").agg(pl.col("Sepal.Width"))
[127]:
shape: (3, 2)
Species | Sepal.Width |
---|---|
str | list[f64] |
"setosa" | [3.5, 3.0, … 3.3] |
"versicolor" | [3.2, 3.2, … 2.8] |
"virginica" | [3.3, 2.7, … 3.0] |
[128]:
# Get the sum of the Sepal.Width per Species
df.groupby("Species").agg(pl.sum("Sepal.Width"))
# this will also work: df.groupby("Species").agg(pl.col("Sepal.Width").sum())
[128]:
shape: (3, 2)
Species | Sepal.Width |
---|---|
str | f64 |
"virginica" | 148.7 |
"setosa" | 171.4 |
"versicolor" | 138.5 |
[129]:
# Get the sum of the Sepal.Width squared per Species
df.groupby("Species").agg((pl.col("Sepal.Width")**2).sum())
[129]:
shape: (3, 2)
Species | Sepal.Width |
---|---|
str | f64 |
"setosa" | 594.6 |
"versicolor" | 388.47 |
"virginica" | 447.33 |
[130]:
# Now multiple operations, producing two derived columns
df.groupby("Species").agg([
    (pl.col("Sepal.Width").sum()**2).alias("Sepal.Width.Squared.of.sums"),
    (pl.col("Sepal.Width")**2).sum().alias("Sepal.Width.Sum.of.squares"),
])
[130]:
shape: (3, 3)
Species | Sepal.Width.Squared.of.sums | Sepal.Width.Sum.of.squares |
---|---|---|
str | f64 | f64 |
"virginica" | 22111.69 | 447.33 |
"setosa" | 29377.96 | 594.6 |
"versicolor" | 19182.25 | 388.47 |
[131]:
# We can keep transforming the result with select (or with_columns),
# since the result of agg is a regular DataFrame
df.groupby("Species").agg([
    (pl.col("Sepal.Width")**2).mean().alias("sq_mean"),
    (pl.col("Sepal.Width").mean()**2).alias("mean_sq"),
]).select([
    pl.col("Species"),
    (pl.col("sq_mean") - pl.col("mean_sq")).alias("Variance"),
])
[131]:
shape: (3, 2)
Species | Variance |
---|---|
str | f64 |
"virginica" | 0.101924 |
"versicolor" | 0.0965 |
"setosa" | 0.140816 |
[132]:
# Note that there is a simpler way to get the variance. The numbers differ
# slightly from the manual computation above because var() defaults to the
# sample variance (ddof=1), while our formula is the population variance
df.groupby("Species").agg(
    pl.col("Sepal.Width").var().alias("Variance")
)
[132]:
shape: (3, 2)
Species | Variance |
---|---|
str | f64 |
"versicolor" | 0.098469 |
"setosa" | 0.14369 |
"virginica" | 0.104004 |
[133]:
# Apply also works with groupby; however, it is a little more tricky,
# because it needs a function that takes a DataFrame and returns a DataFrame.
# Every time the function is called, it receives the DataFrame filtered to
# one group value, and that DataFrame includes the group column itself.
dfn = pl.DataFrame(
    {
        "a": ["a", "b", "a", "c", "b", "c", "b", "c", "b"],
        "b": range(9),
    }
)
counter = [0]  # a one-element list, so the closure below can mutate it
def func(x):
    print("Call number", counter[0])
    counter[0] += 1
    print("Input:", x)
    return x[:2]
# Uncomment to see the per-group DataFrames printed:
# dfn.sort(dfn.columns).groupby("a").apply(func)
[134]:
# We can also use apply inside an agg; there the lambda receives
# each group's column as a Series
df.groupby("Species").agg(
    pl.col("Sepal.Width").apply(lambda x: (x**2).sum())
)
[134]:
shape: (3, 2)
Species | Sepal.Width |
---|---|
str | f64 |
"setosa" | 594.6 |
"versicolor" | 388.47 |
"virginica" | 447.33 |
[135]:
# over() computes an aggregation per group while still returning
# every row of the dataset
# (similar to a window function, OVER (PARTITION BY ...), in SQL)
res_gb = df.groupby("Species").agg(pl.col("Petal.Width").mean().suffix("_mean")).head()
res_over = df.with_columns(pl.col("Petal.Width").mean().over("Species").suffix("_mean"))
print(res_gb)
print(res_over.shape)
print(res_over[0:2])
print(res_over.filter(pl.col("Species") == "virginica")[0:2])
shape: (3, 2)
┌────────────┬──────────────────┐
│ Species ┆ Petal.Width_mean │
│ --- ┆ --- │
│ str ┆ f64 │
╞════════════╪══════════════════╡
│ virginica ┆ 2.026 │
│ versicolor ┆ 1.326 │
│ setosa ┆ 0.246 │
└────────────┴──────────────────┘
(150, 7)
shape: (2, 7)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬─────────┬──────────────────┐
│ ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species ┆ Petal.Width_mean │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ f64 │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═════════╪══════════════════╡
│ 1 ┆ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa ┆ 0.246 │
│ 2 ┆ 4.9 ┆ 3.0 ┆ 1.4 ┆ 0.2 ┆ setosa ┆ 0.246 │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴─────────┴──────────────────┘
shape: (2, 7)
┌─────┬──────────────┬─────────────┬──────────────┬─────────────┬───────────┬──────────────────┐
│ ┆ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species ┆ Petal.Width_mean │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ f64 │
╞═════╪══════════════╪═════════════╪══════════════╪═════════════╪═══════════╪══════════════════╡
│ 101 ┆ 6.3 ┆ 3.3 ┆ 6.0 ┆ 2.5 ┆ virginica ┆ 2.026 │
│ 102 ┆ 5.8 ┆ 2.7 ┆ 5.1 ┆ 1.9 ┆ virginica ┆ 2.026 │
└─────┴──────────────┴─────────────┴──────────────┴─────────────┴───────────┴──────────────────┘
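Because over() keeps every row, it is convenient for per-group normalization; a small sketch:

# Center Petal.Width within each Species using a window mean
df.with_columns(
    (pl.col("Petal.Width") - pl.col("Petal.Width").mean().over("Species"))
    .alias("Petal.Width.centered")
).head()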
2.3. Resampling
Imagine that we are given a time series dataset and need to resample it to weekly data.
[153]:
dft = pl.DataFrame(
    {
        "date": [
            "05/10/2022", "02/28/2022", "07/23/2022", "03/01/2022",
            "08/10/2022", "02/22/2022", "05/17/2022", "04/09/2022",
            "02/10/2022", "05/28/2022", "03/23/2022", "07/03/2022",
            "03/13/2022", "08/03/2022", "04/03/2022", "04/19/2022",
        ],
        "val": range(16),
        "description": [
            "a", "b", "c", "d",
            "e", "f", "g", "h",
            "i", "j", "k", "l",
            "m", "n", "o", "p",
        ],
    }
)
[154]:
# First let's parse the date column into a date type
print(dft.dtypes)
dft = dft.with_columns(pl.col("date").str.strptime(pl.Date, format="%m/%d/%Y"))
print(dft.dtypes)
dft.sort("date").head()
[Utf8, Int64, Utf8]
[Date, Int64, Utf8]
[154]:
shape: (5, 3)
date | val | description |
---|---|---|
date | i64 | str |
2022-02-10 | 8 | "i" |
2022-02-22 | 5 | "f" |
2022-02-28 | 1 | "b" |
2022-03-01 | 3 | "d" |
2022-03-13 | 12 | "m" |
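Once parsed, the dt namespace exposes calendar accessors. As a sketch, assuming this Polars version follows the ISO convention (which differs from Python's date.weekday() used below):

# Polars' dt.weekday() is 1-based (Monday=1 ... Sunday=7),
# while Python's date.weekday() is 0-based (Monday=0)
dft.with_columns(pl.col("date").dt.weekday().alias("weekday")).sort("date").head()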
[155]:
# Now we upsample it to daily data
# Note that we need to sort by date first
dft_daily = dft.sort("date").upsample("date", every="1d").interpolate().fill_null(strategy="forward")
# A naive weekly downsample takes every 7th row; applied to the raw,
# unsorted frame it does not give weekly dates (note the dates below),
# which the next cell fixes by slicing the daily upsampled frame instead
dft_weekly = dft[::7]
dft_weekly.head()
[155]:
shape: (3, 3)
date | val | description |
---|---|---|
date | i64 | str |
2022-05-10 | 0 | "a" |
2022-04-09 | 7 | "h" |
2022-04-03 | 14 | "o" |
[156]:
# Slice the daily upsampled frame instead, with an offset so that
# the weekly samples always fall on Mondays
offset = 7 - dft_daily['date'][0].weekday()
offset = offset if offset != 7 else 0
dft_weekly = dft_daily[offset:][::7]
dft_weekly.head()
[156]:
shape: (5, 3)
date | val | description |
---|---|---|
date | i64 | str |
2022-02-14 | 7 | "i" |
2022-02-21 | 6 | "i" |
2022-02-28 | 1 | "b" |
2022-03-07 | 7 | "d" |
2022-03-14 | 12 | "m" |
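Finally, Polars also has a dedicated temporal grouper. A sketch of the same weekly resample with groupby_dynamic, assuming it is available under that name in this version:

# Weekly buckets with an explicit aggregation instead of row slicing
dft.sort("date").groupby_dynamic("date", every="1w").agg(
    pl.col("val").mean().alias("val_mean")
)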