-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcustom_funcs.py
75 lines (64 loc) · 2.26 KB
/
custom_funcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from collections import Counter

import numpy as np
import pandas as pd
def split_count(x):
    '''
    Count how many times each distinct value occurs in a Series.

    Values are tallied with dict semantics (hash + equality) in order of
    first appearance; the result is then sorted by ascending count.

    Parameters:
        x (Series): is the column of interest
    Returns:
        out (Pandas DataFrame): one row per distinct value of x (the value is
            the index label) with a single integer column 'count'. An empty
            Series yields an empty frame that still has the 'count' column.
    Raises:
        AssertionError: if x is not a pandas Series.
    '''
    assert isinstance(x, pd.Series), 'x must be a pandas Series'
    # Counter iterates the Series values, exactly like the manual loop did.
    tally = Counter(x)
    # Build the 'count' column explicitly so it exists even when x is empty;
    # going through from_dict + rename left an empty frame without any
    # column, and sort_values(by='count') then raised KeyError.
    out = pd.DataFrame(
        {'count': list(tally.values())},
        index=list(tally.keys()),
        dtype='int64',
    )
    return out.sort_values(by='count')
def group_top_and_other(df, num_entries=6):
    '''
    Keep the last `num_entries` rows of a counts table and fold every earlier
    row into a single 'Other' row.

    The split is positional: the first len(df) - num_entries rows are summed.
    For a frame sorted by ascending 'count' (ex. output of split_count) those
    are the smallest counts, so the largest `num_entries` entries are kept.

    Parameters:
        df (DataFrame): dataframe with a 'count' column (index are what has
            been counted), ex. output of split_count.
        num_entries (int): how many independent entries do you want
    Returns:
        out (Pandas DataFrame): the kept rows plus one row labelled 'Other'
            holding the summed count of the folded rows (0 when nothing was
            folded), sorted by ascending 'count'.
    Raises:
        AssertionError: if df is not a DataFrame, num_entries is not
            positive, or df has fewer than num_entries rows.
    '''
    assert isinstance(df, pd.DataFrame)
    assert num_entries > 0
    df_entries = len(df.index)
    assert df_entries >= num_entries
    size_other = df_entries - num_entries
    # Slice by position so repeated index labels cannot be counted twice,
    # and sum the column directly instead of coercing an ndarray to int.
    other_sum = int(df['count'].iloc[:size_other].sum())
    kept = df.iloc[size_other:]
    other_row = pd.DataFrame({'count': [other_sum]}, index=['Other'])
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    combined = pd.concat([kept, other_row])
    return combined.sort_values(by='count', axis=0)
def split_count2(x, instances, values):
    '''
    Sum the `values` column for each distinct entry of the `instances` column.

    Keys are grouped with dict semantics (hash + equality) in order of first
    appearance; the result is then sorted by ascending total.

    Parameters:
        x (df): is the df of interest
        instances(str): name of column to use for counts
        values(str): name of column to use as values
    Returns:
        out (Pandas DataFrame): one row per distinct entry of `instances`
            (the entry is the index label) with a single column 'count'
            holding the sum of the matching `values`. An empty frame yields
            an empty result that still has the 'count' column.
    Raises:
        AssertionError: if x is not a pandas DataFrame.
        KeyError: if `instances` or `values` is not a column of x.
    '''
    assert isinstance(x, pd.DataFrame), 'x must be a pandas DataFrame'
    totals = {}
    # Walk both columns in lockstep by position. Looking rows up with
    # .loc[0..n-1] only worked on a default RangeIndex; on any other index
    # the lookup's KeyError was mistaken for "first time this key is seen".
    for key, amount in zip(x[instances], x[values]):
        if key in totals:
            totals[key] += amount
        else:
            totals[key] = amount
    # Build the 'count' column explicitly so it exists even for empty input.
    out = pd.DataFrame(
        {'count': list(totals.values())},
        index=list(totals.keys()),
    )
    return out.sort_values(by='count')