first commit

gaopgx · Mar 18, 2018 · 99d40af · 99d40af
commit 99d40af
Showing 22 changed files with 610 additions and 0 deletions.
diff --git a/Benchmarker.py b/Benchmarker.py
@@ -0,0 +1,132 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+from time import monotonic
+import pandas as pd
+import numpy as np
+
+
+class Benchmarker:
+    """
+    Times a list of functions against generated dataframes of increasing size, then prints or plots the results.
+
+    Typical use: construct the object, call benchmark_all(), then print_results() and/or plot_results().
+    """
+
+    def __init__(self, df_generator, functions_to_evaluate, title, user_df_size_powers=None,
+                 user_loop_size_powers=None, largest_df_single_test=True):
+        """
+        Parameters
+        ----------
+        df_generator: string or callable. A string is a Python expression that builds the test dataframe; it is
+            evaluated with eval() and is expected to be written in terms of 'df_size'. A callable is invoked as
+            df_generator(df_size).
+        functions_to_evaluate: List[function], a list of functions to be evaluated; each is called with the dataframe.
+        title: string, title of the plots; also used as the file name of the exported figure.
+        user_df_size_powers: List[int] containing the log10(sizes) of the test_dfs (optional).
+        user_loop_size_powers: List[int] containing the log10(sizes) of the loops_sizes (optional).
+        largest_df_single_test: bool, when True the last dataframe is tested with a single run (10 ** 0 loops),
+            otherwise with 10 ** 1 loops. This always replaces the last entry of the loop sizes.
+        """
+
+        self.df_generator = df_generator
+        self.functions_to_evaluate = functions_to_evaluate
+        self.df_size_powers = [2, 3, 4, 5, 6, 7, 8] if user_df_size_powers is None else user_df_size_powers
+        # Copy the loop sizes: the last entry is overwritten below and the caller's own list must stay untouched.
+        self.loop_size_powers = [4, 4, 3, 3, 2, 1, 1] if user_loop_size_powers is None else list(user_loop_size_powers)
+        if self.loop_size_powers:
+            self.loop_size_powers[-1] = 0 if largest_df_single_test else 1
+
+        self.benchmark_results = []
+        self.title = title
+        self.valid = self.validate_functions()
+        if not self.valid:
+            print("WARNING: evaluated functions return different results.")
+
+    def _generate_df(self, df_size):
+        """
+        Builds the test dataframe for the given 'df_size' from 'df_generator'.
+        """
+        if callable(self.df_generator):
+            return self.df_generator(df_size)
+        # NOTE: eval() executes arbitrary code, so only pass generator strings you trust.
+        # The expression reads 'df_size' from this method's local scope.
+        return eval(self.df_generator)
+
+    @staticmethod
+    def _results_equal(first, second):
+        """
+        Returns True when two function results are considered identical.
+        """
+        if isinstance(first, (pd.DataFrame, pd.Series)):
+            return bool(first.equals(second))
+        if isinstance(first, np.ndarray):
+            return bool(np.array_equal(first, second))
+        try:
+            # '==' may return an array-like (e.g. list vs ndarray); np.all reduces it to a single truth value.
+            return bool(np.all(first == second))
+        except Exception:
+            # Best effort: results that cannot be compared are reported as different.
+            return False
+
+    def validate_functions(self):
+        """
+        Runs every function once on a dataframe of the first configured size and compares the results pairwise.
+
+        All functions receive the same dataframe object, so a function that modifies it in place affects the
+        functions that run after it.
+
+        Returns
+        -------
+        True if every pair of results is identical, False otherwise.
+        """
+        df = self._generate_df(10 ** self.df_size_powers[0])
+        functions_results = [function_to_evaluate(df) for function_to_evaluate in self.functions_to_evaluate]
+
+        for index, first in enumerate(functions_results):
+            for second in functions_results[index + 1:]:
+                if not self._results_equal(first, second):
+                    return False
+
+        return True
+
+    def benchmark_time(self, function_to_evaluate):
+        """
+        Creates a test_df with 'df_generator', and runs 'function_to_evaluate' N times, where N = len(df_size_powers)
+        For each run i, a test_df of size 10 ** self.df_size_power[i] is created, and the function_to_evaluate is run
+        for 10 ** loop_size_power[i] times.
+
+        The two power lists are walked in lockstep with zip(), so surplus entries of the longer list are ignored.
+
+        Returns
+        -------
+        A list of size N containing the average time (in seconds) of a single call for each dataframe size.
+
+        """
+        results = []
+
+        for df_size_power, loop_size_power in zip(self.df_size_powers, self.loop_size_powers):
+            df_size = 10 ** df_size_power
+            print("\tTesting with a dataframe of size: ", df_size)
+            df = self._generate_df(df_size)
+
+            loop_size = 10 ** loop_size_power
+
+            # Only the calls themselves are timed; dataframe creation happens before the clock starts.
+            start_time = monotonic()
+
+            for _ in range(loop_size):
+                function_to_evaluate(df)
+
+            end_time = monotonic()
+            per_loop_time = (end_time - start_time) / loop_size
+            print("\tResult (seconds): ", per_loop_time)
+            results.append(per_loop_time)
+
+        return results
+
+    def benchmark_all(self):
+        """
+        Benchmarks all functions in functions_to_evaluate; saves result in benchmark_results.
+
+        Results of a previous call are discarded so that benchmark_results always lines up, entry by entry,
+        with functions_to_evaluate.
+        """
+        self.benchmark_results = []
+        for func in self.functions_to_evaluate:
+            print("Benchmarking function: ", func.__name__)
+            self.benchmark_results.append(self.benchmark_time(func))
+
+    def plot_results(self):
+        """
+        Plots the benchmark results in two stacked subplots, saves the figure to exports/<title>.png and shows it.
+
+        The upper subplot shows the per-call time of every function on a log-scaled y-axis; the lower one shows
+        the same times divided by the times of the first function.
+
+        Raises
+        ------
+        ValueError if there are no results to plot (benchmark_all() has not been run).
+        """
+        import os  # local import: only needed to create the export directory
+
+        if not self.benchmark_results:
+            raise ValueError("No benchmark results to plot; call benchmark_all() first.")
+
+        sns.set_style("darkgrid")
+
+        _, ax = plt.subplots(2, 1, figsize=(7, 14))
+
+        tick_positions = range(len(self.df_size_powers))
+        # Braces around the exponent keep multi-digit powers (e.g. 10) entirely inside the superscript.
+        tick_labels = ["$10^{{{}}}$".format(x) for x in self.df_size_powers]
+
+        plt.sca(ax[0])
+
+        for result, func in zip(self.benchmark_results, self.functions_to_evaluate):
+            plt.semilogy(list(range(len(result))), result, marker="o", label=func.__name__)
+
+        plt.title(self.title, fontsize=15)
+        plt.ylabel("Seconds", fontsize=13)
+        plt.xticks(tick_positions, tick_labels)
+        plt.legend(frameon=True)
+
+        plt.sca(ax[1])
+        baseline = np.array(self.benchmark_results[0])
+        scaled_results = [np.divide(np.array(result), baseline) for result in self.benchmark_results]
+
+        # Keep a minimum y-range so that near-identical timings do not look like large differences.
+        max_diff = np.max(scaled_results)
+        if max_diff < 3:
+            plt.ylim(ymax=3)
+
+        for result, func in zip(scaled_results, self.functions_to_evaluate):
+            plt.plot(list(range(len(result))), result, marker="o", label=func.__name__)
+
+        plt.title(self.title, fontsize=15)
+        plt.ylabel("w.r.t. to '{}' time".format(self.functions_to_evaluate[0].__name__), fontsize=13)
+        plt.xlabel("Dataframe size", fontsize=13)
+        plt.xticks(tick_positions, tick_labels)
+        plt.legend(frameon=True)
+        # savefig does not create missing directories.
+        os.makedirs("exports", exist_ok=True)
+        plt.savefig("exports/{}.png".format(self.title), bbox_inches="tight")
+        plt.show()
+
+    def print_results(self):
+        """
+        Prints one list of per-call times per benchmarked function, in the order of functions_to_evaluate.
+        """
+        for x in self.benchmark_results:
+            print(x)
diff --git a/README.md b/README.md
@@ -0,0 +1,221 @@
+# Fast Pandas
+#### A Benchmarked Pandas Cheat Sheet
+Pandas is one of the most flexible and powerful tools available for data scientists and developers. Being very flexible, one can perform a given task in several ways. This project aims to benchmark the different available methods in such situations; moreover, there is a special section for functions found in both numpy and pandas.
+
+## Introduction:
+This project is not intended to only show the obtained results but also to provide others with a simple method for benchmarking different operations and sharing their results.
+
+Below is a quick example of how to use the benchmarking class:
+
+    from Benchmarker import Benchmarker
+    import numpy as np
+        
+    def pandas_sum(df):
+        return df["A"].sum()
+        
+    def numpy_sum(df):
+        return np.sum(df["A"])
+        
+    params = {
+        "df_generator": 'pd.DataFrame(np.random.randint(1, df_size, (df_size, 2)), columns=list("AB"))',
+        "functions_to_evaluate": [pandas_sum, numpy_sum],
+        "title": "Pandas Sum vs Numpy Sum",
+    }
+    benchmark = Benchmarker(**params)
+    benchmark.benchmark_all()
+    benchmark.print_results()
+    benchmark.plot_results()
+
+The first parameter passed to the class constructor is ***df_generator***, which is a string containing a Python expression that generates a random dataframe; the benchmarker evaluates it with `eval`. This expression has to be defined in terms of ***df_size*** so that dataframes of increasing size are generated. The second parameter is the list of functions to be evaluated, and the last one is the title of the resulting plot.
+
+Calling ***plot_results( )*** will show and save a plot like the one shown below containing two subplots:
+* The first subplot shows the *average* time it has taken each function to run against different dataframe sizes. Note that this is a semilog plot, i.e. the y-axis is shown in log scale. 
+* The second subplot shows how other functions performed with respect to the first function.
+
+You can clearly see that pandas sum is slightly faster than numpy sum, for dataframes below one million rows.
+
+![](https://i.imgur.com/Wq39R0U.png)
+
+### Results Summary:
+![](https://i.imgur.com/ADrrPtd.png)
+
+
+## 1 - Pandas benchmark.
+
+#### 1.1 Dropping duplicate rows:
+There are several methods for dropping duplicate rows in pandas, three of which are tested below:
+
+    def duplicated(df):
+        return df[~df["A"].duplicated(keep="first")].reset_index(drop=True)
+
+    def drop_duplicates(df):
+        return df.drop_duplicates(subset="A", keep="first").reset_index(drop=True)
+
+    def group_by_drop(df):
+        return df.groupby(df["A"], as_index=False, sort=False).first()
+
+* ***duplicated* is the fastest; irrespective of size.**
+* **The *group_by* drop shows an interesting trend. It could be possible for it to be faster than duplicated for data frames larger than 100 million rows.**
+
+![](https://i.imgur.com/T2rk3qc.png)
+
+#### 1.2 - Iterating over all rows:
+Tested functions:
+
+    def iterrows_function(df):
+        for index, row in df.iterrows():
+            pass
+
+    def itertuples_function(df: pd.DataFrame):
+        for row in df.itertuples():
+            pass
+      
+ - **itertuples is significantly faster than iterrows (up to 50 times faster)**
+![](https://i.imgur.com/CjjCCoB.png)
+
+
+
+#### 1.3 - Selecting rows:
+Tested functions:
+
+    def query_selection(df):
+        return df[(df["A"] > 0) & (df["A"] < 100)]
+
+    def bracket_selection(df):
+        return df.query("A > 0 and A < 100")
+
+    def loc_selection(df):
+        return df.loc[(df["A"] > 0) & (df["A"] < 100)]
+
+    def ne_selection(df):
+        A = df["A"].values
+        return df[ne.evaluate("(A > 0) & (A < 100)")]
+
+    def ne_create_selection(df):
+        A = df["A"].values
+        mask = ne.evaluate("(A > 0) & (A < 100)")
+        return pd.DataFrame(df.values[mask], df.index[mask], df.columns)
+
+ * ***ne_create_selection* is the fastest method for dataframes smaller than 10000 rows, followed by *ne_selection* for larger data frames.**
+ * ***loc and query selections* are identical in performance.**
+ * **Square bracket selection is the slowest method.**
+![](https://i.imgur.com/Vc2NKOY.png)
+#### 1.4 - Creating a new column:
+Tested functions:
+
+    def regular(df):
+        df["E"] = df["A"] * df["B"] + df["C"]
+
+    def eval_method(df):
+        df.eval("E = A * B + C", inplace=True)
+
+ * **The regular method is faster than the eval method.**
+* ***eval_method* shows an interesting, erratic behavior that I couldn't explain; however, I repeated the test several times with different mathematical operations and still reproduced the same plot every time.**
+![enter image description here](https://i.imgur.com/RWqPHXj.png)
+
+
+## 2 - Pandas vs Numpy.
+
+This section tests the performance of functions that are found in both numpy and pandas.
+#### 2.1 - Summation performance:
+Tested functions:
+
+    def pandas_sum(df):
+        return df["A"].sum()
+
+    def numpy_sum(df):
+        return np.sum(df["A"])
+      
+ * **pandas sum is slightly faster than numpy sum, for dataframes below one million rows.**
+
+   ![](https://i.imgur.com/Wq39R0U.png)
+
+
+
+#### 2.2 - Sort performance:
+Tested functions:
+
+    def pandas_sort(df):  
+      return df["A"].sort_values()  
+                  
+    def numpy_sort(df):  
+      return np.sort(df["A"])
+
+* **numpy_sort is considerably faster than pandas, irrespective of size; although they both use quicksort as the default sorting algorithm.**
+
+![](https://i.imgur.com/V9AK0pK.png)
+
+
+#### 2.3 - Unique performance:
+Tested functions:
+
+	def pandas_unique(df):
+	    return df["A"].unique()
+
+	def numpy_unique(df):
+	    return np.unique(df["A"])
+* **For data frames over 100 rows pandas unique is faster than numpy.**
+* **It is worth noting that unlike pandas unique, numpy unique returns a sorted array, which explains the discrepancy in results**
+
+![](https://i.imgur.com/YDREzNo.png)
+
+#### 2.4 - Median performance:
+Tested functions:
+
+    def pandas_median(df):
+        return df["A"].median()
+
+    def numpy_median(df):
+        return np.median(df["A"])
+
+* **No significant statistical difference in performance.**
+![](https://i.imgur.com/tFxos1W.png)
+#### 2.5 - Mean performance:
+Tested functions:
+
+    def pandas_mean(df):
+        return df["A"].mean()
+
+    def numpy_mean(df):
+        return np.mean(df["A"])
+ * **pandas mean is slightly faster than numpy mean, for dataframes below one million rows.**
+
+![](https://i.imgur.com/AXzJ4Dx.png)
+
+#### 2.6 - Product performance:
+Tested functions:
+
+    def pandas_prod(df):
+        return df["A"].prod()
+
+    def numpy_prod(df):
+        return np.prod(df["A"])
+        
+ * **pandas product is slightly faster than numpy product, for dataframes below one million rows.**
+
+![](https://i.imgur.com/NmLHueA.png)
+
+## Extra notes:
+
+#### Extra parameters: 
+The class constructor has three other optional parameters:
+
+    "user_df_size_powers": List[int] containing the log10(sizes) of the test_dfs
+    "user_loop_size_powers": List[int] containing the log10(sizes) of the loops_sizes
+    "largest_df_single_test" (default = True)
+You can pass custom sizes for the dataframes and loops used in benchmarking; this is suggested when there seems to be noise in the results, i.e. you are unable to maintain consistency over different runs.
+The third parameter, *largest_df_single_test*, is set to true by default, since the largest default dataframe has 100 million rows and some operations take a long time to complete even a single run on it. When it is true, each function is run only once on the last dataframe.
+
+#### Warnings:
+The benchmarker will warn you if the results returned by the evaluated functions are not identical. You might not need to worry about that, as it has been shown in the benchmarking of the *np.unique* function above.
+
+
+## Future work:
+####   -Using median instead of mean as it is less prone to noise. 
+####  -Benchmarking memory consumption.
+
+
+----------
+
+Got something on your mind you would like to benchmark? We are waiting for your results.
+
diff --git a/benchmark_create_column.py b/benchmark_create_column.py
@@ -0,0 +1,21 @@
+from Benchmarker import Benchmarker
+
+
+def regular(df):
+    """Adds column "E" to df in place as A * B + C, using plain pandas column arithmetic."""
+    df["E"] = df["A"] * df["B"] + df["C"]
+
+
+def eval_method(df):
+    """Adds column "E" to df in place as A * B + C, using DataFrame.eval with inplace=True."""
+    df.eval("E = A * B + C", inplace=True)
+
+
+# Benchmark configuration: the generator expression is written in terms of 'df_size' and yields
+# four integer columns A-D; both column-creation candidates are timed on the same dataframes.
+params = dict(
+    df_generator='pd.DataFrame(np.random.randint(1, df_size, (df_size, 4)), columns=list("ABCD"))',
+    functions_to_evaluate=[regular, eval_method],
+    title="Benchmark for column creation",
+)
+
+benchmark = Benchmarker(**params)
+
+# Run the timings, then report them as text and as a plot.
+for step in (benchmark.benchmark_all, benchmark.print_results, benchmark.plot_results):
+    step()