import polars as pl
import time
import matplotlib
Polars from Python and R
Pro-tip: Just swap . (Python) for $ (R), or vice versa.
Load libraries
library(polars)
Warning: package 'polars' was built under R version 4.4.1
Scan data
nyc = pl.scan_parquet("nyc-taxi/**/*.parquet", hive_partitioning=True)
nyc
NAIVE QUERY PLAN
run LazyFrame.show_graph() to see the optimized version
nyc = pl$scan_parquet("nyc-taxi/**/*.parquet", hive_partitioning=TRUE)
nyc
polars LazyFrame
$explain(): Show the optimized query plan.
Naive plan:
Parquet SCAN [nyc-taxi/year=2012/month=1/part-0.parquet, ... 11 other files]
PROJECT */24 COLUMNS
First example
Polars operations are registered as queries until they are collected.
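Before the taxi example, here is a minimal toy sketch of that lazy behavior (the LazyFrame and column name below are illustrative, not part of the taxi data): building a query returns a plan object immediately, and nothing is computed until collect() is called.

lf = pl.LazyFrame({"x": [1, 2, 3]})
q = lf.select(pl.col("x") * 2)  # no computation yet; q is just a query plan
type(q)                         # <class 'polars.lazyframe.frame.LazyFrame'>
q.collect()                     # computation happens here

With that in mind, here's the first example on the taxi data: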
q1 = (
    nyc
    .group_by(["passenger_count"])
    .agg([
        pl.mean("tip_amount") # .alias("mean_tip") ## alias is optional
    ])
    .sort("passenger_count")
)
q1
NAIVE QUERY PLAN
run LazyFrame.show_graph() to see the optimized version
q1 = (
    nyc
    $group_by("passenger_count")
    $agg(
        pl$mean("tip_amount") # $alias("mean_tip") ## alias is optional
    )
    $sort("passenger_count")
)
q1
polars LazyFrame
$explain(): Show the optimized query plan.
Naive plan:
SORT BY [col("passenger_count")]
AGGREGATE
[col("tip_amount").mean()] BY [col("passenger_count")] FROM
Parquet SCAN [nyc-taxi/year=2012/month=1/part-0.parquet, ... 11 other files]
PROJECT */24 COLUMNS
Polars-style x$method1()$method2()... chaining may seem a little odd to R users, especially for multiline queries. Here I have adopted the same general styling as Python: by enclosing the full query in parentheses (), we can start each $method() on a new line. If this isn’t to your fancy, you could also rewrite these multiline queries as follows:
nyc$group_by(
    "passenger_count"
)$agg(
    pl$mean("tip_amount")
)$sort("passenger_count")
(Note: this is the naive query plan, not the optimized query that polars will actually implement for us. We’ll come back to this idea shortly.)
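In Python, you can also print both versions of the plan as text with LazyFrame.explain(); a small sketch, where the optimized argument toggles between the two:

print(q1.explain())                  # optimized plan
print(q1.explain(optimized=False))   # naive plan, as printed above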
Calling collect() enforces computation.
tic = time.time()
dat1 = q1.collect()
toc = time.time()

dat1
passenger_count | tip_amount |
---|---|
i64 | f64 |
0 | 0.862099 |
1 | 1.151011 |
2 | 1.08158 |
3 | 0.962949 |
4 | 0.844519 |
… | … |
177 | 1.0 |
208 | 0.0 |
247 | 2.3 |
249 | 0.0 |
254 | 0.0 |
# print(f"Time difference of {toc - tic} seconds")
tic = Sys.time()
dat1 = q1$collect()
toc = Sys.time()

dat1
passenger_count | tip_amount |
---|---|
i64 | f64 |
0 | 0.862099 |
1 | 1.151011 |
2 | 1.08158 |
3 | 0.962949 |
4 | 0.844519 |
… | … |
177 | 1.0 |
208 | 0.0 |
247 | 2.3 |
249 | 0.0 |
254 | 0.0 |
toc - tic
Time difference of 2.880659 secs
Aggregation
Subsetting along partition dimensions allows for even more efficiency gains.
q2 = (
    nyc
    .filter(pl.col("month") <= 3)
    .group_by(["month", "passenger_count"])
    .agg([pl.mean("tip_amount").alias("mean_tip")])
    .sort("passenger_count")
)
q2 = (
    nyc
    $filter(pl$col("month") <= 3)
    $group_by("month", "passenger_count")
    $agg(pl$mean("tip_amount")$alias("mean_tip"))
    $sort("passenger_count")
)
Let’s take a look at the optimized query that Polars will implement for us.
# q2             # naive
q2.show_graph()  # optimized
# q2 # naive
cat(q2$explain()) # optimized
SORT BY [col("passenger_count")]
AGGREGATE
[col("tip_amount").mean().alias("mean_tip")] BY [col("month"), col("passenger_count")] FROM
Parquet SCAN [nyc-taxi/year=2012/month=1/part-0.parquet, ... 11 other files]
PROJECT 2/24 COLUMNS
SELECTION: [(col("month").cast(Unknown(Float))) <= (dyn float: 3.0)]
Now, let’s run the query and collect the results.
tic = time.time()
dat2 = q2.collect()
toc = time.time()

dat2
month | passenger_count | mean_tip |
---|---|---|
i64 | i64 | f64 |
3 | 0 | 0.877675 |
1 | 0 | 0.841718 |
2 | 0 | 0.876637 |
3 | 1 | 1.089205 |
2 | 1 | 1.06849 |
… | … | … |
1 | 9 | 0.0 |
2 | 9 | 0.0 |
1 | 65 | 0.0 |
3 | 208 | 0.0 |
1 | 208 | 0.0 |
# print(f"Time difference of {toc - tic} seconds")
tic = Sys.time()
dat2 = q2$collect()
toc = Sys.time()

dat2
month | passenger_count | mean_tip |
---|---|---|
i64 | i64 | f64 |
3 | 0 | 0.877675 |
1 | 0 | 0.841718 |
2 | 0 | 0.876637 |
3 | 1 | 1.089205 |
1 | 1 | 1.036863 |
… | … | … |
2 | 9 | 0.0 |
1 | 9 | 0.0 |
1 | 65 | 0.0 |
1 | 208 | 0.0 |
3 | 208 | 0.0 |
toc - tic
Time difference of 3.721074 secs
A high-dimensional grouping example. This is one query where polars is noticeably slower than DuckDB.
q3 = (
    nyc
    .group_by(["passenger_count", "trip_distance"])
    .agg([
        pl.mean("tip_amount").alias("mean_tip"),
        pl.mean("fare_amount").alias("mean_fare"),
    ])
    .sort(["passenger_count", "trip_distance"])
)
tic = time.time()
dat3 = q3.collect()
toc = time.time()

dat3
passenger_count | trip_distance | mean_tip | mean_fare |
---|---|---|---|
i64 | f64 | f64 | f64 |
0 | 0.0 | 1.345135 | 17.504564 |
0 | 0.01 | 0.178571 | 34.642857 |
0 | 0.02 | 4.35 | 61.05 |
0 | 0.03 | 16.25 | 74.0 |
0 | 0.04 | 0.03 | 46.5 |
… | … | … | … |
208 | 5.1 | 0.0 | 12.5 |
208 | 6.6 | 0.0 | 17.7 |
247 | 3.31 | 2.3 | 11.5 |
249 | 1.69 | 0.0 | 8.5 |
254 | 1.02 | 0.0 | 6.0 |
# print(f"Time difference of {toc - tic} seconds")
q3 = (
    nyc
    $group_by("passenger_count", "trip_distance")
    $agg(
        pl$mean("tip_amount")$alias("mean_tip"),
        pl$mean("fare_amount")$alias("mean_fare")
    )
    $sort("passenger_count", "trip_distance")
)
tic = Sys.time()
dat3 = q3$collect()
toc = Sys.time()

dat3
passenger_count | trip_distance | mean_tip | mean_fare |
---|---|---|---|
i64 | f64 | f64 | f64 |
0 | 0.0 | 1.345135 | 17.504564 |
0 | 0.01 | 0.178571 | 34.642857 |
0 | 0.02 | 4.35 | 61.05 |
0 | 0.03 | 16.25 | 74.0 |
0 | 0.04 | 0.03 | 46.5 |
… | … | … | … |
208 | 5.1 | 0.0 | 12.5 |
208 | 6.6 | 0.0 | 17.7 |
247 | 3.31 | 2.3 | 11.5 |
249 | 1.69 | 0.0 | 8.5 |
254 | 1.02 | 0.0 | 6.0 |
toc - tic
Time difference of 50.74207 secs
As an aside, if we didn’t care about column aliases (or sorting), then the previous query could be shortened to:
(
    nyc
    .group_by(["passenger_count", "trip_distance"])
    .agg(pl.col(["tip_amount", "fare_amount"]).mean())
    .collect()
)
(
    nyc
    $group_by("passenger_count", "trip_distance")
    $agg(pl$col("tip_amount", "fare_amount")$mean())
    $collect()
)
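Conversely, if you do want informative names without spelling out each alias, the polars expression name namespace offers a compact middle ground; a sketch using name.prefix():

(
    nyc
    .group_by(["passenger_count", "trip_distance"])
    .agg(pl.col(["tip_amount", "fare_amount"]).mean().name.prefix("mean_"))
    .collect()
)  # yields mean_tip_amount and mean_fare_amount columns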
Pivot (reshape)
In polars, we have two distinct reshape methods:

- pivot: long => wide
- unpivot: wide => long
Here we’ll unpivot to go from wide to long, and use the eager execution engine (i.e., on the dat3 DataFrame object that we’ve already computed) for expediency.
dat3.unpivot(index = ["passenger_count", "trip_distance"])
passenger_count | trip_distance | variable | value |
---|---|---|---|
i64 | f64 | str | f64 |
0 | 0.0 | "mean_tip" | 1.345135 |
0 | 0.01 | "mean_tip" | 0.178571 |
0 | 0.02 | "mean_tip" | 4.35 |
0 | 0.03 | "mean_tip" | 16.25 |
0 | 0.04 | "mean_tip" | 0.03 |
… | … | … | … |
208 | 5.1 | "mean_fare" | 12.5 |
208 | 6.6 | "mean_fare" | 17.7 |
247 | 3.31 | "mean_fare" | 11.5 |
249 | 1.69 | "mean_fare" | 8.5 |
254 | 1.02 | "mean_fare" | 6.0 |
dat3$unpivot(index = c("passenger_count", "trip_distance"))
passenger_count | trip_distance | variable | value |
---|---|---|---|
i64 | f64 | str | f64 |
0 | 0.0 | "mean_tip" | 1.345135 |
0 | 0.01 | "mean_tip" | 0.178571 |
0 | 0.02 | "mean_tip" | 4.35 |
0 | 0.03 | "mean_tip" | 16.25 |
0 | 0.04 | "mean_tip" | 0.03 |
… | … | … | … |
208 | 5.1 | "mean_fare" | 12.5 |
208 | 6.6 | "mean_fare" | 17.7 |
247 | 3.31 | "mean_fare" | 11.5 |
249 | 1.69 | "mean_fare" | 8.5 |
254 | 1.02 | "mean_fare" | 6.0 |
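For completeness, here is what the reverse (long => wide) direction looks like with pivot. A minimal sketch on a toy DataFrame, since the id/variable/value names below are illustrative rather than taken from the taxi data:

long = pl.DataFrame({
    "id":       [1, 1, 2, 2],
    "variable": ["a", "b", "a", "b"],
    "value":    [10, 20, 30, 40],
})
long.pivot(on="variable", index="id", values="value")  # columns: id, a, b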
Joins (merges)
mean_tips  = nyc.group_by("month").agg(pl.col("tip_amount").mean())
mean_fares = nyc.group_by("month").agg(pl.col("fare_amount").mean())
(
    mean_tips
    .join(
        mean_fares,
        on = "month",
        how = "left" # default is inner join
    )
    .collect()
)
month | tip_amount | fare_amount |
---|---|---|
i64 | f64 | f64 |
9 | 1.254601 | 12.391198 |
3 | 1.056353 | 10.223107 |
11 | 1.250903 | 12.270138 |
6 | 1.091082 | 10.548651 |
8 | 1.079521 | 10.49265 |
… | … | … |
10 | 1.281239 | 12.501252 |
5 | 1.078014 | 10.585157 |
2 | 1.036874 | 9.94264 |
4 | 1.043167 | 10.33549 |
7 | 1.059312 | 10.379943 |
mean_tips  = nyc$group_by("month")$agg(pl$col("tip_amount")$mean())
mean_fares = nyc$group_by("month")$agg(pl$col("fare_amount")$mean())
(
    mean_tips
    $join(
        mean_fares,
        on = "month",
        how = "left" # default is inner join
    )
    $collect()
)
month | tip_amount | fare_amount |
---|---|---|
i64 | f64 | f64 |
7 | 1.059312 | 10.379943 |
9 | 1.254601 | 12.391198 |
8 | 1.079521 | 10.49265 |
6 | 1.091082 | 10.548651 |
12 | 1.237651 | 12.313953 |
… | … | … |
3 | 1.056353 | 10.223107 |
5 | 1.078014 | 10.585157 |
2 | 1.036874 | 9.94264 |
11 | 1.250903 | 12.270138 |
1 | 1.007817 | 9.813488 |
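Beyond the left join above, Polars supports the usual join strategies via the how argument. A quick sketch on toy frames (the frames and key names here are illustrative):

left  = pl.DataFrame({"k": [1, 2, 3], "x": ["a", "b", "c"]})
right = pl.DataFrame({"k": [2, 3, 4], "y": [20.0, 30.0, 40.0]})
left.join(right, on="k", how="inner")  # matching keys only (the default)
left.join(right, on="k", how="full")   # all keys from both sides
left.join(right, on="k", how="anti")   # rows in left with no match in right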
Appendix: Alternate interfaces
The native polars API is not the only way to interface with the underlying computation engine. Here are two alternate approaches that you may prefer, especially if you don’t want to learn a new syntax.
Ibis (Python)
The great advantage of Ibis (like dbplyr) is that it supports multiple backends through an identical frontend. So, all of our syntax logic and workflow from the Ibis+DuckDB section carry over to an equivalent Ibis+Polars workflow too. All you need to do is change the connection type. For example:
import ibis
import ibis.selectors as s
from ibis import _
##! This next line is the only thing that's changed !##
con = ibis.polars.connect()

con.register("nyc-taxi/**/*.parquet", "nyc")
<string>:2: FutureWarning: `Backend.register` is deprecated as of v9.1; use the explicit `read_*` method for the filetype you are trying to read, e.g., read_parquet, read_csv, etc.
/Users/gmcd/Documents/Projects/duckdb-polars/.venv/lib/python3.12/site-packages/ibis/backends/polars/__init__.py:75: PerformanceWarning: Resolving the schema of a LazyFrame is a potentially expensive operation. Use `LazyFrame.collect_schema()` to get the schema without this warning.
schema = PolarsSchema.to_ibis(self._tables[name].schema)
DatabaseTable: nyc
vendor_name string
pickup_datetime timestamp
dropoff_datetime timestamp
passenger_count int64
trip_distance float64
pickup_longitude float64
pickup_latitude float64
rate_code string
store_and_fwd string
dropoff_longitude float64
dropoff_latitude float64
payment_type string
fare_amount float64
extra float64
mta_tax float64
tip_amount float64
tolls_amount float64
total_amount float64
improvement_surcharge float64
congestion_surcharge float64
pickup_location_id int64
dropoff_location_id int64
nyc = con.table("nyc")
(
    nyc
    .group_by(["passenger_count"])
    .agg(mean_tip = _.tip_amount.mean())
    .to_polars()
)
shape: (18, 2)
┌─────────────────┬──────────┐
│ passenger_count ┆ mean_tip │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════════════════╪══════════╡
│ 6 ┆ 1.128365 │
│ 9 ┆ 0.8068 │
│ 3 ┆ 0.962949 │
│ 0 ┆ 0.862099 │
│ 66 ┆ 1.5 │
│ … ┆ … │
│ 8 ┆ 0.350769 │
│ 5 ┆ 1.102732 │
│ 65 ┆ 0.0 │
│ 208 ┆ 0.0 │
│ 247 ┆ 2.3 │
└─────────────────┴──────────┘
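To underscore the backend-swapping point, the same pipeline should run on DuckDB just by changing the connection. A sketch, using read_parquet (the non-deprecated replacement for register, per the warning above):

con2 = ibis.duckdb.connect()
nyc2 = con2.read_parquet("nyc-taxi/**/*.parquet", table_name="nyc")
(
    nyc2
    .group_by(["passenger_count"])
    .agg(mean_tip = _.tip_amount.mean())
    .to_polars()
)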
tidypolars (R)
The R package tidypolars (link) provides the “tidyverse” syntax while using polars as the backend. The syntax and workflow should thus be immediately familiar to R users.
It’s important to note that tidypolars is solely focused on the translation work. This means that you still need to load the main polars library alongside it for the actual computation, as well as dplyr (and potentially tidyr) for function generics.
library(polars) ## Already loaded
library(tidypolars)
library(dplyr, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)
nyc = scan_parquet_polars("nyc-taxi/**/*.parquet")
nyc |>
    summarise(mean_tip = mean(tip_amount), .by = passenger_count) |>
    compute()
passenger_count | mean_tip |
---|---|
i64 | f64 |
0 | 0.862099 |
9 | 0.8068 |
3 | 0.962949 |
6 | 1.128365 |
66 | 1.5 |
… | … |
5 | 1.102732 |
8 | 0.350769 |
65 | 0.0 |
208 | 0.0 |
247 | 2.3 |
Aside: Use collect() instead of compute() at the end if you would prefer to return a standard R data.frame instead of a Polars DataFrame.
See also polarssql (link) if you would like yet another “tidyverse”-esque alternative that works through DBI/d(b)plyr.