Installation
pip install polars
# Install Polars with all optional dependencies:
pip install 'polars[all]'
# You can also install a subset of all optional dependencies:
pip install 'polars[numpy,pandas,pyarrow]'
# We also have a conda package (however pip is the preferred way):
conda install -c conda-forge polars
Usage:
Importing Polars:
import polars as pl
- Creating DataFrames:
# From dictionary
df = pl.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
# From list of dictionaries
df = pl.DataFrame([{'A': 1, 'B': 'a'}, {'A': 2, 'B': 'b'}])
# From CSV
df = pl.read_csv('file.csv')
# From Pandas DataFrame
import pandas as pd
pandas_df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
df = pl.from_pandas(pandas_df)
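You can also pass an explicit schema to control dtypes at construction time; a minimal sketch reusing the toy columns above (the chosen dtypes are only an illustration):
# From dictionary with an explicit schema
df = pl.DataFrame(
    {'A': [1, 2, 3], 'B': ['a', 'b', 'c']},
    schema={'A': pl.Int64, 'B': pl.Utf8},
)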
- Basic DataFrame Operations:
# Display DataFrame
print(df)
# Get DataFrame info
df.schema
# Select columns
df.select(['A', 'B'])
# Filter rows
df.filter(pl.col('A') > 2)
# Sort DataFrame
df.sort('A', descending=True)
# Add new column with a constant value
df.with_columns(pl.lit('new_col').alias('C'))
# Rename columns
df.rename({'A': 'X', 'B': 'Y'})
# Drop columns
df.drop(['A', 'B'])
# Group by and aggregate
df.group_by('A').agg(pl.sum('B'))
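Because each operation returns a new DataFrame, they chain naturally; a minimal sketch on a toy frame (column names are assumptions for illustration):
df = pl.DataFrame({'A': [1, 2, 3, 4], 'B': ['x', 'y', 'x', 'y']})
result = (
    df.filter(pl.col('A') > 1)                               # keep rows with A > 1
      .with_columns((pl.col('A') * 10).alias('A_times_10'))  # add a derived column
      .sort('A', descending=True)                            # largest A first
)
print(result)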
- Data Manipulation:
# Apply function to column
df.with_columns(pl.col('A').map_elements(lambda x: x * 2, return_dtype=pl.Int64).alias('A_doubled'))
# Fill null values
df.fill_null(strategy='forward')
# Replace values
df.with_columns(pl.col('A').replace({1: 10, 2: 20}))
# Melt DataFrame
df.melt(id_vars=['A'], value_vars=['B', 'C'])
# Pivot DataFrame
df.pivot(values='value', index='A', on='variable')
- String Operations:
# Convert to uppercase
df.with_columns(pl.col('B').str.to_uppercase())
# String contains
df.filter(pl.col('B').str.contains('pattern'))
# String replace
df.with_columns(pl.col('B').str.replace('old', 'new'))
# String length
df.with_columns(pl.col('B').str.len_chars().alias('B_length'))
- DateTime Operations:
# Parse strings to datetime
df.with_columns(pl.col('date').str.strptime(pl.Datetime, '%Y-%m-%d'))
# Extract components
df.with_columns(pl.col('date').dt.year().alias('year'))
# Date arithmetic
df.with_columns((pl.col('date') + pl.duration(days=1)).alias('next_day'))
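A small end-to-end sketch, assuming a string 'date' column; dt.offset_by is used here for the one-day shift:
df = pl.DataFrame({'date': ['2021-01-01', '2021-06-15']})
df = df.with_columns(pl.col('date').str.strptime(pl.Date, '%Y-%m-%d'))
df = df.with_columns(
    pl.col('date').dt.year().alias('year'),
    pl.col('date').dt.offset_by('1d').alias('next_day'),
)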
- Joining DataFrames:
# Inner join
df1.join(df2, on='key', how='inner')
# Left join
df1.join(df2, on='key', how='left')
# Full (outer) join
df1.join(df2, on='key', how='full')
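A minimal sketch with two toy frames to show how the join types differ (column names are assumptions):
df1 = pl.DataFrame({'key': [1, 2, 3], 'a': ['x', 'y', 'z']})
df2 = pl.DataFrame({'key': [2, 3, 4], 'b': [20, 30, 40]})
df1.join(df2, on='key', how='left')  # all rows of df1; b is null where key has no match in df2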
- Window Functions:
# Cumulative sum
df.with_columns(pl.col('A').cum_sum().over('B'))
# Rolling average
df.with_columns(pl.col('A').rolling_mean(window_size=3).over('B'))
# Rank
df.with_columns(pl.col('A').rank().over('B'))
- IO Operations:
# Write to CSV
df.write_csv('output.csv')
# Write to Parquet
df.write_parquet('output.parquet')
# Read Parquet
pl.read_parquet('file.parquet')
- Lazy Execution:
# Create lazy DataFrame
lazy_df = pl.scan_csv('large_file.csv')
# Define operations
result = lazy_df.filter(pl.col('A') > 0).group_by('B').agg(pl.sum('C'))
# Execute lazy computation
result.collect()
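The lazy engine builds an optimized query plan that can be inspected before execution; a sketch assuming the same file and column names as above:
lazy_df = pl.scan_csv('large_file.csv')
query = lazy_df.filter(pl.col('A') > 0).group_by('B').agg(pl.sum('C'))
print(query.explain())    # show the optimized plan (pushdowns applied)
result = query.collect()  # run it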
- Advanced Filtering:
# Multiple conditions
df.filter((pl.col('A') > 5) & (pl.col('B') < 10))
# Is in list
df.filter(pl.col('A').is_in([1, 3, 5]))
# Is null
df.filter(pl.col('A').is_null())
# Is between
df.filter(pl.col('A').is_between(5, 10))
- Sampling:
# Random sample
df.sample(n=10)
# Stratified sample: take up to 5 rows per group via a shuffled in-group row index
df.filter(pl.int_range(pl.len()).shuffle().over('category') < 5)
- Set Operations:
# Union
df1.vstack(df2)
# Intersection
df1.join(df2, on='key', how='inner')
# Difference
df1.join(df2, on='key', how='anti')
- Advanced Aggregations:
# Multiple aggregations
df.group_by('A').agg([
pl.sum('B').alias('B_sum'),
pl.mean('C').alias('C_mean'),
pl.n_unique('D').alias('D_unique_count')
])
# Custom aggregation built from expressions (sum over count per group)
df.group_by('A').agg((pl.col('B').sum() / pl.col('B').count()).alias('B_sum_over_count'))
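Expressions inside agg can themselves be filtered, which covers many conditional aggregations without Python lambdas; a sketch with assumed column names:
df.group_by('A').agg(
    pl.col('B').filter(pl.col('C') > 1).sum().alias('B_sum_where_C_gt_1')
)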
- Reshaping Data:
# Explode a list column into one row per element
df.explode('list_col')
# Concatenate string columns
df.with_columns(pl.concat_str(['A', 'B'], separator='-').alias('A_B'))
- Time Series Operations:
# Resample time series
df.group_by_dynamic('timestamp', every='1h').agg(pl.sum('value'))
# Shift values
df.with_columns(pl.col('A').shift(1).alias('A_lagged'))
# Difference between consecutive rows
df.with_columns((pl.col('A') - pl.col('A').shift(1)).alias('A_diff'))
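group_by_dynamic (and other temporal windows) expects the index column to be sorted; a minimal sketch, column names assumed:
df = df.sort('timestamp')
df.group_by_dynamic('timestamp', every='1h').agg(pl.sum('value'))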
- Missing Data Handling:
# Drop rows with any null values
df.drop_nulls()
# Drop rows where specific columns have null values
df.drop_nulls(subset=['A', 'B'])
# Interpolate missing values
df.with_columns(pl.col('A').interpolate())
- Data Type Operations:
# Cast column to different type
df.with_columns(pl.col('A').cast(pl.Float64))
# Get unique values
df.select(pl.col('A').unique())
# Count unique values
df.select(pl.col('A').n_unique())
- Advanced String Operations:
# Extract using regex
df.with_columns(pl.col('text').str.extract(r'(\d+)', group_index=1))
# Split string into multiple columns
df.with_columns([
pl.col('full_name').str.split(' ').list.get(0).alias('first_name'),
pl.col('full_name').str.split(' ').list.get(1).alias('last_name')
])
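str.extract_all is handy when a pattern can occur more than once per row; a sketch assuming a 'text' column:
# Extract every run of digits into a list column
df.with_columns(pl.col('text').str.extract_all(r'\d+').alias('all_numbers'))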
- Window Functions with Custom Sorting:
# Cumulative sum with custom sorting
df.sort('date').with_columns(
    pl.col('value')
    .cum_sum()
    .over(['category', 'subcategory'])
)
- Conditional Expressions:
# When-Then-Otherwise
df.with_columns(
pl.when(pl.col('A') > 5)
.then(pl.lit('High'))
.when(pl.col('A') < 2)
.then(pl.lit('Low'))
.otherwise(pl.lit('Medium'))
.alias('A_category')
)
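when/then also works with numeric results, e.g. a simple conditional clamp; a sketch assuming column 'A':
df.with_columns(
    pl.when(pl.col('A') < 0).then(0).otherwise(pl.col('A')).alias('A_nonneg')
)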
- Advanced IO Operations:
# Read JSON
pl.read_json('file.json')
# Read from database (URI-based)
pl.read_database_uri(query='SELECT * FROM table', uri='postgresql://user:pass@host/db')
# Write to database
df.write_database(table_name='my_table', connection='postgresql://user:pass@host/db')
- Performance Optimization:
# Thread count: set POLARS_MAX_THREADS in the environment before Polars is imported
import os
os.environ['POLARS_MAX_THREADS'] = '4'
# Streaming engine for CSV files larger than memory
pl.scan_csv('large_file.csv').collect(streaming=True)
- Expressions and Custom Functions:
# Custom function
def custom_func(x):
return x * 2 + 1
# Apply custom function
df.with_columns(pl.col('A').map_batches(custom_func).alias('A_custom'))
# Complex expressions
df.with_columns(
((pl.col('A') * 2 + pl.col('B')) / pl.col('C')).alias('complex_calc')
)
- List Operations:
# Get list length
df.with_columns(pl.col('list_col').list.len().alias('list_length'))
# Get nth element from list
df.with_columns(pl.col('list_col').list.get(1).alias('second_element'))
# Join list elements
df.with_columns(pl.col('list_col').list.join(',').alias('joined_list'))
# Slice list
df.with_columns(pl.col('list_col').list.slice(0, 3).alias('first_three'))
- Struct Operations:
# Create struct column
df.with_columns(pl.struct(['A', 'B']).alias('AB_struct'))
# Access struct field
df.with_columns(pl.col('AB_struct').struct.field('A').alias('A_from_struct'))
# Unnest struct
df.unnest('AB_struct')
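value_counts is a common source of struct columns and pairs naturally with unnest; a sketch assuming a 'B' column:
df.select(pl.col('B').value_counts()).unnest('B')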
- Advanced Groupby Operations:
# Rolling group by (temporal window per row; index column must be sorted)
df.rolling(index_column='date', period='7d').agg(pl.sum('value'))
# Dynamic group by
df.group_by_dynamic('timestamp', every='1h', offset='30m').agg(pl.mean('value'))
# Groupby with exclusions
df.group_by('category', maintain_order=True).agg(
pl.all().exclude(['category', 'id'])
)
- Vectorized User-Defined Functions (UDFs):
import numpy as np
# NumPy ufuncs work directly on Polars expressions (vectorized, no Python loop)
df.with_columns(np.log(pl.col('A')).alias('A_log'))
# Arbitrary Series-in/Series-out functions can be applied with map_batches
df.with_columns(pl.col('A').map_batches(lambda s: np.log(s)).alias('A_log'))
- Meta Operations:
# Get column names
df.columns
# Get dtypes
df.dtypes
# Get shape
df.shape
# Memory usage
df.estimated_size()
- Advanced Joining:
# Asof join
df1.join_asof(df2, left_on='date', right_on='date', by='id')
# Cross join
df1.join(df2, how='cross')
# Semi join (keep rows of df1 that have a match in df2)
df1.join(df2, on='key', how='semi')
- Polars-specific Optimizations:
# Predicate pushdown
(df.lazy()
.filter(pl.col('A') > 0)
.group_by('B')
.agg(pl.sum('C'))
.collect())
# Projection pushdown
(df.lazy()
.select(['A', 'B'])
.filter(pl.col('A') > 0)
.collect())
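Pushdowns matter most when scanning files, because only the needed columns and row groups are read from disk; a sketch with an assumed Parquet file:
(pl.scan_parquet('file.parquet')
   .select(['A', 'B'])
   .filter(pl.col('A') > 0)
   .collect())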
- Working with Missing Data:
# Fill null with different values based on condition
df.with_columns(
pl.when(pl.col('A').is_null())
.then(pl.col('B'))
.otherwise(pl.col('A'))
.alias('A_filled')
)
# Fill null with forward fill and a limit
df.with_columns(pl.col('A').fill_null(strategy='forward', limit=2))
- Advanced DateTime Operations:
# Truncate to specific time unit
df.with_columns(pl.col('datetime').dt.truncate('1d').alias('day_start'))
# Get day of week
df.with_columns(pl.col('date').dt.weekday().alias('weekday'))
# Date range (eager Series of daily dates)
from datetime import date
pl.date_range(date(2021, 1, 1), date(2021, 12, 31), interval='1d', eager=True)
- Statistical Functions:
# Covariance
df.select(pl.cov('A', 'B'))
# Correlation
df.select(pl.corr('A', 'B'))
# Quantile
df.select(pl.col('A').quantile(0.75))
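For a quick overview of all columns at once, describe aggregates several of these statistics:
# Summary statistics (count, nulls, mean, std, min, quantiles, max) per column
df.describe()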
- Advanced String Matching:
# Case-insensitive regex match
df.with_columns(pl.col('text').str.contains(r'(?i)pattern').alias('matches_pattern'))
# Count regex matches per row
df.with_columns(pl.col('text').str.count_matches(r'\d+').alias('digit_runs'))
This cheat sheet covers many common Polars functions and operations. Polars has many more features and capabilities, including advanced filtering, reshaping, time series operations, struct operations, vectorized UDFs, meta operations, and performance-oriented, Polars-specific optimizations. Please refer to the official documentation for the most up-to-date and comprehensive information on available functions and best practices.
Source: https://gist.github.com