You can use the following methods to calculate a cumulative sum in a PySpark DataFrame:
Method 1: Calculate Cumulative Sum of One Column
from pyspark.sql import Window
from pyspark.sql import functions as F
#define window for calculating cumulative sum
my_window = (Window.orderBy('day')
             .rowsBetween(Window.unboundedPreceding, 0))
#create new DataFrame that contains cumulative sales column
df_new = df.withColumn('cum_sales', F.sum('sales').over(my_window))
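The frame rowsBetween(Window.unboundedPreceding, 0) tells Spark to sum from the first row of the ordered window up to and including the current row. The 0 is equivalent to Window.currentRow, so the same window can also be written with the named constant, as in the snippet below:
from pyspark.sql import Window
#equivalent frame using the named constant instead of 0
my_window = (Window.orderBy('day')
             .rowsBetween(Window.unboundedPreceding, Window.currentRow))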
Method 2: Calculate Cumulative Sum of One Column, Grouped by Another Column
from pyspark.sql import Window
from pyspark.sql import functions as F
#define window for calculating cumulative sum
my_window = (Window.partitionBy('store').orderBy('day')
             .rowsBetween(Window.unboundedPreceding, 0))
#create new DataFrame that contains cumulative sales, grouped by store
df_new = df.withColumn('cum_sales', F.sum('sales').over(my_window))
The following examples show how to use each method in practice.
Example 1: Calculate Cumulative Sum of One Column
Suppose we have the following PySpark DataFrame that contains information about the sales made during 10 consecutive days at some grocery store:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
#define data
data = [[1, 11],
        [2, 8],
        [3, 4],
        [4, 5],
        [5, 5],
        [6, 8],
        [7, 7],
        [8, 7],
        [9, 6],
        [10, 4]]
#define column names
columns = ['day', 'sales']
#create dataframe using data and column names
df = spark.createDataFrame(data, columns)
#view dataframe
df.show()
+---+-----+
|day|sales|
+---+-----+
|  1|   11|
|  2|    8|
|  3|    4|
|  4|    5|
|  5|    5|
|  6|    8|
|  7|    7|
|  8|    7|
|  9|    6|
| 10|    4|
+---+-----+
We can use the following syntax to calculate the cumulative sum of the values in the sales column:
from pyspark.sql import Window
from pyspark.sql import functions as F
#define window for calculating cumulative sum
my_window = (Window.orderBy('day')
             .rowsBetween(Window.unboundedPreceding, 0))
#create new DataFrame that contains cumulative sales column
df_new = df.withColumn('cum_sales', F.sum('sales').over(my_window))
#view new DataFrame
df_new.show()
+---+-----+---------+
|day|sales|cum_sales|
+---+-----+---------+
|  1|   11|       11|
|  2|    8|       19|
|  3|    4|       23|
|  4|    5|       28|
|  5|    5|       33|
|  6|    8|       41|
|  7|    7|       48|
|  8|    7|       55|
|  9|    6|       61|
| 10|    4|       65|
+---+-----+---------+
The resulting DataFrame contains a new column called cum_sales that shows the cumulative sum of the values in the sales column.
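As a side note, if you leave the window frame unspecified (for example, by writing Window.orderBy('day') without rowsBetween), Spark applies a default RANGE frame that ends at the current row. With a unique ordering column like day this produces the same running total, but rows that are tied on the ordering value would be summed together as peers, so spelling out the ROWS frame as shown above is the safer pattern. A minimal sketch of the default-frame version, using the df created above:
from pyspark.sql import Window
from pyspark.sql import functions as F
#same running total here because 'day' is unique, but tied 'day' values
#would be summed together under the default RANGE frame
default_window = Window.orderBy('day')
df.withColumn('cum_sales', F.sum('sales').over(default_window)).show()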
Example 2: Calculate Cumulative Sum of One Column, Grouped by Another Column
Suppose we have the following PySpark DataFrame that contains information about the sales made during 5 consecutive days at each of two different grocery stores:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
#define data
data = [['A', 1, 11],
        ['A', 2, 8],
        ['A', 3, 4],
        ['A', 4, 5],
        ['A', 5, 5],
        ['B', 6, 8],
        ['B', 7, 7],
        ['B', 8, 7],
        ['B', 9, 6],
        ['B', 10, 4]]
#define column names
columns = ['store', 'day', 'sales']
#create dataframe using data and column names
df = spark.createDataFrame(data, columns)
#view dataframe
df.show()
+-----+---+-----+
|store|day|sales|
+-----+---+-----+
|    A|  1|   11|
|    A|  2|    8|
|    A|  3|    4|
|    A|  4|    5|
|    A|  5|    5|
|    B|  6|    8|
|    B|  7|    7|
|    B|  8|    7|
|    B|  9|    6|
|    B| 10|    4|
+-----+---+-----+
We can use the following syntax to calculate the cumulative sum of the values in the sales column, grouped by the values in the store column:
from pyspark.sql import Window
from pyspark.sql import functions as F
#define window for calculating cumulative sum
my_window = (Window.partitionBy('store').orderBy('day')
             .rowsBetween(Window.unboundedPreceding, 0))
#create new DataFrame that contains cumulative sales, grouped by store
df_new = df.withColumn('cum_sales', F.sum('sales').over(my_window))
#view new DataFrame
df_new.show()
+-----+---+-----+---------+
|store|day|sales|cum_sales|
+-----+---+-----+---------+
|    A|  1|   11|       11|
|    A|  2|    8|       19|
|    A|  3|    4|       23|
|    A|  4|    5|       28|
|    A|  5|    5|       33|
|    B|  6|    8|        8|
|    B|  7|    7|       15|
|    B|  8|    7|       22|
|    B|  9|    6|       28|
|    B| 10|    4|       32|
+-----+---+-----+---------+
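The resulting DataFrame contains a new column called cum_sales that shows the cumulative sum of the values in the sales column, calculated separately for each store. If you also want the final cumulative total for each store, one option is to aggregate the original sales column directly, since the last cum_sales value in each partition equals the sum of that store's sales. A minimal sketch, assuming the df_new created above:
from pyspark.sql import functions as F
#per-store totals, which match the last cum_sales value within each store
df_new.groupBy('store').agg(F.sum('sales').alias('total_sales')).show()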