Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Mahmoud Mansour committed Mar 18, 2018
0 parents commit 99d40af
Showing 22 changed files with 610 additions and 0 deletions.
132 changes: 132 additions & 0 deletions Benchmarker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import matplotlib.pyplot as plt
import seaborn as sns
from time import monotonic
import pandas as pd
import numpy as np


class Benchmarker:
    """Time a set of functions against generated dataframes of increasing size.

    Typical usage: construct the object, call ``benchmark_all()``, then
    ``print_results()`` and/or ``plot_results()``.
    """

    def __init__(self, df_generator, functions_to_evaluate, title, user_df_size_powers=None,
                 user_loop_size_powers=None, largest_df_single_test=True):
        """
        Parameters
        ----------
        df_generator: string or callable. A string is an expression written in terms of
            ``df_size`` that is evaluated to build the test dataframe; a callable is
            invoked as ``df_generator(df_size)``.
        functions_to_evaluate: List[function], a list of functions to be evaluated. Each one
            is called with the test dataframe as its only argument.
        title: string, used as the plot title and as the file name of the exported plot.
        user_df_size_powers: List[int] containing the log10(sizes) of the test_dfs (optional).
        user_loop_size_powers: List[int] containing the log10(sizes) of the loops_sizes (optional).
        largest_df_single_test: bool, when True the last (largest) dataframe is timed with a
            single call. When False the last loop size is left as given.

        Raises
        ------
        ValueError: if the list of dataframe sizes is empty.
        """

        self.df_generator = df_generator
        self.functions_to_evaluate = functions_to_evaluate

        # Work on private copies so the caller's lists are never modified.
        self.df_size_powers = ([2, 3, 4, 5, 6, 7, 8] if user_df_size_powers is None
                               else list(user_df_size_powers))
        self.loop_size_powers = ([4, 4, 3, 3, 2, 1, 1] if user_loop_size_powers is None
                                 else list(user_loop_size_powers))
        if not self.df_size_powers:
            raise ValueError("df_size_powers must contain at least one entry.")

        # 10 ** 0 == 1: the largest dataframe is exercised exactly once.
        if largest_df_single_test and self.loop_size_powers:
            self.loop_size_powers[-1] = 0

        self.benchmark_results = []
        self.title = title
        self.valid = self.validate_functions()
        if not self.valid:
            print("WARNING: evaluated functions return different results.")

    def _generate_df(self, df_size):
        """Build a test dataframe for the requested size."""
        if callable(self.df_generator):
            return self.df_generator(df_size)
        # NOTE: eval() executes arbitrary code; df_generator must come from a trusted source.
        # The expression can use the local name `df_size` and this module's globals.
        return eval(self.df_generator)

    @staticmethod
    def _results_equal(first, second):
        """Return True when two function results are considered identical."""
        if isinstance(first, (pd.DataFrame, pd.Series)):
            # `==` on pandas objects is element-wise and has no single truth value.
            return bool(first.equals(second))
        if isinstance(first, np.ndarray):
            return bool(np.array_equal(first, second))
        try:
            return bool(first == second)
        except Exception:
            # Best effort: results that cannot be compared are reported as different.
            return False

    def validate_functions(self):
        """
        Runs every function once on the smallest test dataframe and compares the results pairwise.

        Returns
        -------
        True if all results are equal (or there are fewer than two functions), False otherwise.
        """
        df = self._generate_df(10 ** self.df_size_powers[0])
        functions_results = [function_to_evaluate(df) for function_to_evaluate in self.functions_to_evaluate]

        for i, first in enumerate(functions_results):
            for second in functions_results[i + 1:]:
                if not self._results_equal(first, second):
                    return False
        return True

    def benchmark_time(self, function_to_evaluate):
        """
        Creates a test_df with 'df_generator', and runs 'function_to_evaluate' N times, where N is the
        number of (df_size_power, loop_size_power) pairs.
        For each run i, a test_df of size 10 ** self.df_size_powers[i] is created, and the
        function_to_evaluate is run for 10 ** self.loop_size_powers[i] times on that same dataframe.

        Returns
        -------
        A list of size N containing the average time (in seconds) of a single call for each size.
        """
        results = []

        for df_size_power, loop_size_power in zip(self.df_size_powers, self.loop_size_powers):
            df_size = 10 ** df_size_power
            print("\tTesting with a dataframe of size: ", df_size)
            df = self._generate_df(df_size)

            loop_size = 10 ** loop_size_power

            start_time = monotonic()
            for _ in range(loop_size):
                function_to_evaluate(df)
            end_time = monotonic()

            per_loop_time = (end_time - start_time) / loop_size
            print("\tResult (seconds): ", per_loop_time)
            results.append(per_loop_time)

        return results

    def benchmark_all(self):
        """
        Benchmarks all functions in functions_to_evaluate; saves result in benchmark_results.

        Results from a previous call are discarded so that benchmark_results always lines up
        one-to-one with functions_to_evaluate.
        """
        self.benchmark_results = []
        for func in self.functions_to_evaluate:
            print("Benchmarking function: ", func.__name__)
            self.benchmark_results.append(self.benchmark_time(func))

    def plot_results(self):
        """
        Plots the benchmark results, saves the figure to "exports/<title>.png" and shows it.

        The first subplot shows the per-call time of every function on a log scale; the second
        shows every function's time divided by the time of the first function.

        Raises
        ------
        ValueError: if there are no benchmark results to plot.
        """
        import os  # local import keeps this method self-contained

        if not self.benchmark_results:
            raise ValueError("No benchmark results to plot; call benchmark_all() first.")

        sns.set_style("darkgrid")

        fig, ax = plt.subplots(2, 1, figsize=(7, 14))
        tick_positions = range(len(self.df_size_powers))
        # Braces around the exponent keep multi-character powers inside the superscript.
        tick_labels = ["$10^{{{}}}$".format(x) for x in self.df_size_powers]

        plt.sca(ax[0])

        for result, function_name in zip(self.benchmark_results, self.functions_to_evaluate):
            plt.semilogy(list(range(len(result))), result, marker="o", label=function_name.__name__)

        plt.title(self.title, fontsize=15)
        plt.ylabel("Seconds", fontsize=13)
        plt.xticks(tick_positions, tick_labels)
        plt.legend(frameon=True)

        plt.sca(ax[1])
        baseline = np.array(self.benchmark_results[0])
        scaled_results = [np.divide(np.array(result), baseline) for result in self.benchmark_results]

        # Keep a minimum vertical range so near-identical timings do not look dramatic.
        max_diff = np.max(scaled_results)
        if max_diff < 3:
            plt.ylim(ymax=3)

        for result, function_name in zip(scaled_results, self.functions_to_evaluate):
            plt.plot(list(range(len(result))), result, marker="o", label=function_name.__name__)

        plt.title(self.title, fontsize=15)
        plt.ylabel("w.r.t. to '{}' time".format(self.functions_to_evaluate[0].__name__), fontsize=13)
        plt.xlabel("Dataframe size", fontsize=13)
        plt.xticks(tick_positions, tick_labels)
        plt.legend(frameon=True)

        # savefig does not create missing directories.
        os.makedirs("exports", exist_ok=True)
        plt.savefig("exports/{}.png".format(self.title), bbox_inches="tight")
        plt.show()

    def print_results(self):
        """Prints the list of per-call times of every benchmarked function, one list per line."""
        for x in self.benchmark_results:
            print(x)
221 changes: 221 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
# Fast Pandas
#### A Benchmarked Pandas Cheat Sheet
Pandas is one of the most flexible and powerful tools available for data scientists and developers. Being very flexible, one can perform a given task in several ways. This project aims to benchmark the different available methods in such situations; moreover, there is a special section for functions found in both numpy and pandas.

## Introduction:
This project is not intended to only show the obtained results but also to provide others with a simple method for benchmarking different operations and sharing their results.

Below is a quick example of how to use the benchmarking class:

from Benchmarker import Benchmarker
import numpy as np
def pandas_sum(df):
return df["A"].sum()
def numpy_sum(df):
return np.sum(df["A"])
params = {
"df_generator": 'pd.DataFrame(np.random.randint(1, df_size, (df_size, 2)), columns=list("AB"))',
"functions_to_evaluate": [pandas_sum, numpy_sum],
"title": "Pandas Sum vs Numpy Sum",
}
benchmark = Benchmarker(**params)
benchmark.benchmark_all()
benchmark.print_results()
benchmark.plot_results()

The first parameter passed to the class constructor is ***df_generator***, which is simply a string containing the expression that generates a random dataframe. This expression has to be defined in terms of ***df_size*** so that different dataframes with increasing sizes are generated. The second parameter is the list of functions to be evaluated, and the last one is the title of the resulting plot.

Calling ***plot_results( )*** will show and save a plot like the one shown below containing two subplots:
* The first subplot shows the *average* time it has taken each function to run against different dataframe sizes. Note that this is a semilog plot, i.e. the y-axis is shown in log scale.
* The second subplot shows how other functions performed with respect to the first function.

You can clearly see that pandas sum is slightly faster than numpy sum, for dataframes below one million rows.

![](https://i.imgur.com/Wq39R0U.png)

### Results Summary:
![](https://i.imgur.com/ADrrPtd.png)


## 1 - Pandas benchmark.

#### 1.1 Dropping duplicate rows:
There are several methods for dropping duplicate rows in pandas, three of which are tested below:

def duplicated(df):
return df[~df["A"].duplicated(keep="first")].reset_index(drop=True)

def drop_duplicates(df):
return df.drop_duplicates(subset="A", keep="first").reset_index(drop=True)

def group_by_drop(df):
return df.groupby(df["A"], as_index=False, sort=False).first()

* ***duplicated* is the fastest; irrespective of size.**
* **The *group_by* drop shows an interesting trend. It could be possible for it to be faster than duplicated for data frames larger than 100 million rows.**

![](https://i.imgur.com/T2rk3qc.png)

#### 1.2 - Iterating over all rows:
Tested functions:

def iterrows_function(df):
for index, row in df.iterrows():
pass
def itertuples_function(df: pd.DataFrame):
for row in df.itertuples():
pass
- **itertuples is significantly faster than iterrows (up to 50 times faster)**
![](https://i.imgur.com/CjjCCoB.png)



#### 1.3 - Selecting rows:
Tested functions:

def query_selection(df):
return df[(df["A"] > 0) & (df["A"] < 100)]

def bracket_selection(df):
return df.query("A > 0 and A < 100")

def loc_selection(df):
return df.loc[(df["A"] > 0) & (df["A"] < 100)]

def ne_selection(df):
A = df["A"].values
return df[ne.evaluate("(A > 0) & (A < 100)")]

def ne_create_selection(df):
A = df["A"].values
mask = ne.evaluate("(A > 0) & (A < 100)")
return pd.DataFrame(df.values[mask], df.index[mask], df.columns)

* ***ne_create_selection* is the fastest method for dataframes smaller than 10000 rows, followed by *ne_selection* for larger data frames.**
* ***loc and query selections* are identical in performance.**
* **Square bracket selection is the slowest method.**
![](https://i.imgur.com/Vc2NKOY.png)
#### 1.4 - Creating a new column:
Tested functions:

def regular(df):
df["E"] = df["A"] * df["B"] + df["C"]

def eval_method(df):
df.eval("E = A * B + C", inplace=True)

* **The regular method is faster than the eval method.**
* **eval_method shows an interesting erratic behavior that I couldn't explain; however, I repeated the test several times with different mathematical operations and still reproduced the same plot every time.**
![enter image description here](https://i.imgur.com/RWqPHXj.png)


## 2 - Pandas vs Numpy.

This section tests the performance of functions that are found in both numpy and pandas.
#### 2.1 - Summation performance:
Tested functions:

def pandas_sum(df):
return df["A"].sum()

def numpy_sum(df):
return np.sum(df["A"])
* **pandas sum is slightly faster than numpy sum, for dataframes below one million rows.**

![](https://i.imgur.com/Wq39R0U.png)



#### 2.2 - Sort performance:
Tested functions:

def pandas_sort(df):
return df["A"].sort_values()
def numpy_sort(df):
return np.sort(df["A"])

* **numpy_sort is considerably faster than pandas, irrespective of size; although they both use quicksort as the default sorting algorithm.**

![](https://i.imgur.com/V9AK0pK.png)


#### 2.3 - Unique performance:
Tested functions:

def pandas_unique(df):
return df["A"].unique()

def numpy_unique(df):
return np.unique(df["A"])
* **For data frames over 100 rows pandas unique is faster than numpy.**
* **It is worth noting that unlike pandas unique, numpy unique returns a sorted array, which explains the discrepancy in results**

![](https://i.imgur.com/YDREzNo.png)

#### 2.4 - Median performance:
Tested functions:

def pandas_median(df):
return df["A"].median()

def numpy_median(df):
return np.median(df["A"])

* **No significant statistical difference in performance.**
![](https://i.imgur.com/tFxos1W.png)
#### 2.5 - Mean performance:
Tested functions:

def pandas_mean(df):
return df["A"].mean()

def numpy_mean(df):
return np.mean(df["A"])
* **pandas mean is slightly faster than numpy mean, for dataframes below one million rows.**

![](https://i.imgur.com/AXzJ4Dx.png)

#### 2.6 - Product performance:
Tested functions:

def pandas_prod(df):
return df["A"].prod()

def numpy_prod(df):
return np.prod(df["A"])
* **pandas product is slightly faster than numpy product, for dataframes below one million rows.**

![](https://i.imgur.com/NmLHueA.png)

## Extra notes:

#### Extra parameters:
The class constructor has three other optional parameters:

"user_df_size_powers": List[int] containing the log10(sizes) of the test_dfs
"user_loop_size_powers": List[int] containing the log10(sizes) of the loops_sizes
"largest_df_single_test" (default = True)
You can pass custom sizes for the dataframes and loops used in benchmarking; this is suggested when there seems to be noise in the results, i.e. you are unable to maintain consistency over different runs.
The third parameter, *largest_df_single_test*, is set to true by default; since the last dataframe has 100 million rows and for some operations it will take a large amount of time to complete a single task.

#### Warnings:
The benchmarker will warn you if the results returned by the evaluated functions are not identical. You might not need to worry about that, as it has been shown in the benchmarking of the *np.unique* function above.


## Future work:
#### -Using median instead of mean as it is less prone to noise.
#### -Benchmarking memory consumption.


----------

Got something on your mind you would like to benchmark? We are waiting for your results.

21 changes: 21 additions & 0 deletions benchmark_create_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from Benchmarker import Benchmarker


def regular(df):
    """Add column "E" to *df* in place using plain column arithmetic (A * B + C).

    Returns nothing; the dataframe passed in is modified.
    """
    df["E"] = df["A"] * df["B"] + df["C"]


def eval_method(df):
    """Add column "E" to *df* in place through ``DataFrame.eval`` (E = A * B + C).

    Returns nothing; ``inplace=True`` makes the call modify the dataframe passed in.
    """
    df.eval("E = A * B + C", inplace=True)


# Benchmark configuration: a four-column random integer dataframe ("A".."D") whose
# size follows df_size, the two candidate implementations, and the plot title.
params = dict(
    df_generator='pd.DataFrame(np.random.randint(1, df_size, (df_size, 4)), columns=list("ABCD"))',
    functions_to_evaluate=[regular, eval_method],
    title="Benchmark for column creation",
)

# Run the benchmark, then report the timings as text and as a plot.
benchmark = Benchmarker(**params)
benchmark.benchmark_all()
benchmark.print_results()
benchmark.plot_results()
Loading

0 comments on commit 99d40af

Please sign in to comment.