-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_to_df.py
148 lines (110 loc) · 3.98 KB
/
html_to_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import os
import os
from pathlib import Path
### MAIN FUNCTIONS ###
def html_price_print_outputs(html_path, downloads_route=True):
    '''
    Desc: Take a locally stored HTML file of FB marketplace
    with items with prices listed, print a short price summary
    and store the prices as a csv.
    Args:
    - html_path : str : path to the locally saved HTML file
    - downloads_route : bool : if True the csv is written to the
      "Downloads" folder under the user's home directory, otherwise
      to the current working directory
    Returns:
    - None
    Results:
    - Prints the number of outliers removed, the mean price and the
      25th-75th percentile range
    - Stores a csv of the price data as 'output.csv'
    '''
    readable = convert_html_path_to_readble(html_path)
    prices, outliers, og_n = convert_html_to_prices(readable)

    # A page without any "$" amounts gives og_n == 0; report 0% rather
    # than crashing with ZeroDivisionError.
    outlier_share = outliers / og_n if og_n else 0.0
    avg, distro = beautify_data(prices)

    str_out = 'Removed {} ({:.0%}) outliers from {} data points'.format(
        outliers,
        outlier_share,
        og_n
    )
    str_out += '\n The mean price for this FB Marketplace data is ${:.2f} \n {}'.format(
        avg,
        distro
    )
    print(str_out)

    # Build the destination with pathlib so the separator is right on
    # every platform (the old string concatenation hard-coded "/").
    out_dir = Path.home() / 'Downloads' if downloads_route else Path.cwd()
    dump_path = out_dir / 'output.csv'
    prices.to_csv(dump_path)
    print('File saved to {}.'.format(dump_path))
def html_price_to_outputs(html_path):
    '''
    Placeholder, not implemented yet.

    Intended to take the key price information and emit it in a
    predictable format (TBD) that can be fed to the chrome extension.
    For now the argument is ignored and None is returned.
    '''
    return None
### UTIL FUNCTIONS ###
# convert_html_path_to_readble  (sic - spelled this way in the code)
# convert_html_to_prices
# beautify_data
def convert_html_path_to_readble(html_path):
    '''
    Read the file at html_path as UTF-8 text.
    Input:
    html_path : str : path of the saved HTML file
    Returns:
    str : the full contents of the file
    '''
    with open(html_path, mode='r', encoding='utf-8') as html_file:
        return html_file.read()
def convert_html_to_prices(html_str, remove_outliers=True):
    '''
    Take an HTML string and convert it into a series of prices, together
    with the sample size and the number of outliers removed.
    #TODO: eventually need to extract other useful data
    #TODO: assume that is USD for now. Search string looks for $ but can be swapped with other currencies
    Inputs:
    html_str : str : FB marketplace html converted to string
    remove_outliers : bool : whether to drop prices that lie 2 or more
        median-absolute-deviations away from the median price
    Returns tuple:
    0 : pd.Series : prices (ints when every price is a whole-dollar
        amount, floats as soon as one price has cents)
    1 : int : outliers removed
    2 : int : original data points
    '''
    # Only the visible text is searched, not tag attributes.
    soup = BeautifulSoup(html_str, 'html.parser')
    # Either a comma-grouped amount ("$1,200") or a plain run of up to
    # six digits ("$1200"), each with optional cents. Without the first
    # alternative "$1,200" would be read as "$1".
    matches = re.findall(
        r"\$(?:\d{1,3}(?:,\d{3})+(?!\d)|\d{1,6})(?:\.\d{1,2})?",
        soup.get_text()
    )
    nums = np.array([_parse_price(x) for x in matches])

    if remove_outliers:
        nums = _reject_outliers(nums)
    outliers_removed = len(matches) - len(nums)

    return pd.Series(nums), outliers_removed, len(matches)


def _parse_price(token):
    '''
    Turn one matched "$..." token into a number: int for whole-dollar
    amounts, float when cents are present (int("12.50") would raise).
    '''
    digits = token[1:].replace(',', '')
    return float(digits) if '.' in digits else int(digits)


def _reject_outliers(data, m=2.):
    '''
    Keep only the points whose absolute deviation from the median is
    less than m times the median absolute deviation.
    https://stackoverflow.com/questions/11686720/is-there-a-numpy-builtin-to-reject-outliers-from-a-list
    '''
    # Nothing to filter; also avoids numpy's warning for the median of
    # an empty array.
    if data.size == 0:
        return data
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    # A zero MAD (at least half the points equal the median) would divide
    # by zero, so in that case every point is kept.
    s = d / mdev if mdev else np.zeros(len(d))
    return data[s < m]
def beautify_data(price_series):
    '''
    Summarise a price series in a user-friendly form.
    Input:
    price_series : pd.Series : data of prices
    Output: Tuple
    0 : float : the mean price of the price_series
    1 : str : sentence giving the 25th-75th percentile range and the
        sample size, all taken from price_series.describe()
    '''
    summary = price_series.describe()
    lower_q, upper_q, n_obs = (
        summary.loc[label] for label in ('25%', '75%', 'count')
    )
    template = ('The 25th-75th percentile range of prices is' +
                ' ${:.0f}-${:.0f} based on a sample size of {:.0f}')
    return np.mean(price_series), template.format(lower_q, upper_q, n_obs)
if __name__ == "__main__":
    # Optional first command-line argument: path of the saved HTML page.
    # With no argument the script falls back to "test_data.html".
    import sys

    html_price_print_outputs(sys.argv[1] if len(sys.argv) > 1 else "test_data.html")