forked from neverforgit/PeMS_Tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtime_df_growth.py
96 lines (83 loc) · 2.97 KB
/
time_df_growth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import numpy as np
from numpy.random import rand
import pandas as pd
from utils.timer import Timer
# Benchmark configuration constants (read by the generator, methods and timing loops below)
num_dfs = 500 # Number of random dataframes to generate
n_rows = 2500 # Number of rows in each generated dataframe
n_cols = 40 # Number of columns in each generated dataframe
n_reps = 100 # Number of repetitions for timing
# Build the benchmark input: num_dfs dataframes of uniform random values,
# each n_rows x n_cols with integer column labels 0..n_cols-1
df_list = [
    pd.DataFrame(rand(n_rows, n_cols), columns=np.arange(n_cols))
    for _ in range(num_dfs)
]
##
# Define four methods of growing a large dataframe
##
# Method 1 - append dataframes
def method1():
    """Grow a dataframe by adding the frames of df_list one at a time.

    Starts from an empty dataframe with n_cols columns and concatenates
    each dataframe in df_list onto the accumulated result in turn,
    renumbering the index on every step.

    Returns:
        The accumulated dataframe containing the rows of every frame in
        df_list.
    """
    # Seed with the same column labels as the generated frames
    # (0..n_cols-1) instead of a hard-coded 4 columns.
    out_df1 = pd.DataFrame(columns=np.arange(n_cols))
    for df in df_list:
        # pd.concat is the documented replacement for DataFrame.append
        # (deprecated in pandas 1.4, removed in 2.0).  The accumulator is
        # still rebuilt on every iteration, which is the growth pattern
        # this method exists to measure.
        out_df1 = pd.concat([out_df1, df], ignore_index=True)
    return out_df1
# Method 2 - preallocate an empty dataframe of the final size
def method2():
    """Fill a preallocated, initially empty dataframe slice by slice.

    Allocates a dataframe with num_dfs * n_rows rows and n_cols columns,
    casts every column to the dtype of the matching column of the first
    frame in df_list, then writes each frame's values into its row range.

    Returns:
        The filled dataframe.
    """
    total_rows = num_dfs * n_rows
    out_df2 = pd.DataFrame(index=np.arange(total_rows), columns=np.arange(n_cols))

    # Cast each column to the dtype used by the first source frame
    template_dtypes = df_list[0].dtypes
    for pos, name in enumerate(out_df2.columns):
        out_df2[name] = out_df2[name].astype(template_dtypes[pos])

    # Copy every source frame into its own consecutive block of rows
    start = 0
    for df in df_list:
        stop = start + n_rows
        out_df2.iloc[start:stop, :] = df.values
        start = stop
    return out_df2
# Method 3 - preallocate dataframe with fake data of appropriate type
def method3():
    """Fill a dataframe preallocated from placeholder arrays of the right dtype.

    Builds one uninitialised array of n_rows * num_dfs elements per dtype of
    the first frame in df_list, stacks them as columns into a placeholder
    dataframe, then overwrites each row range with a frame's values.

    Returns:
        The filled dataframe.
    """
    total_rows = n_rows * num_dfs

    # One uninitialised placeholder array per column dtype of the first frame
    per_column = [np.empty(total_rows, dtype=col_dtype) for col_dtype in df_list[0].dtypes]
    # Stacking yields one row per column, so transpose to rows x columns
    placeholder = np.array(per_column).T
    out_df3 = pd.DataFrame(placeholder)

    # Overwrite the placeholder contents with the real values
    for block_no, chunk in enumerate(df_list):
        first = block_no * n_rows
        out_df3.iloc[first:first + n_rows, :] = chunk.values
    return out_df3
# Method 4 - use pd.concat on df_list
def method4():
    """Combine every frame in df_list with a single pd.concat call.

    Returns:
        One dataframe holding the rows of all frames in df_list, with the
        index renumbered (ignore_index=True).
    """
    combined = pd.concat(df_list, ignore_index=True)
    return combined
##
# Time all four methods
##
def _time_method(method, label):
    """Time n_reps calls of `method` and print a summary.

    Each call runs inside a Timer context; the elapsed seconds reported by
    the timer (its `secs` attribute) are recorded per repetition.

    Args:
        method: Zero-argument callable to time.
        label: Integer method number used in the printed summary.

    Returns:
        A tuple (times, result): the array of n_reps elapsed times in
        seconds, and the value returned by the last call of `method`
        (None if n_reps is 0).
    """
    times = np.empty(n_reps)
    result = None
    for rep in range(n_reps):
        with Timer() as t:
            result = method()
        # Read after the with-block exits, as the original loops did
        times[rep] = t.secs
    # Parenthesised single-argument print works as a Python 2 statement and
    # as a Python 3 function call, producing identical output.
    print('Total time for %d repetitions of Method %d: %f [sec]' % (n_reps, label, np.sum(times)))
    print('Best time: %f' % (np.min(times)))
    print('Mean time: %f' % (np.mean(times)))
    return times, result


# Time Method 1
times_1, df1 = _time_method(method1, 1)
# Time Method 2
times_2, df2 = _time_method(method2, 2)
# Time Method 3
times_3, df3 = _time_method(method3, 3)
# Time Method 4
times_4, df4 = _time_method(method4, 4)