-
Notifications
You must be signed in to change notification settings - Fork 4
/
xbrl_parse.py
153 lines (135 loc) · 4.68 KB
/
xbrl_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from contextlib import closing
from datetime import datetime
from pprint import pprint
from xml.dom import minidom

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

from bs4 import BeautifulSoup
import pandas as pd
class Company:
    """A company on SEC EDGAR, looked up through its Atom feed of 10-* filings.

    Constructing an instance performs a network request: the Atom feed for
    the given CIK is downloaded and parsed immediately.

    Attributes:
        cik: the identifier passed to the constructor.
        documents: list of ``Document`` objects, filled by ``get_documents``.
        data: dict of company metadata, filled by ``get_documents``.
        xml: the parsed (minidom) Atom feed.
    """

    def __init__(self, cik):
        """Download and parse the EDGAR Atom feed for *cik*.

        *cik* is converted with ``str`` before being placed in the query
        string, so both string and integer identifiers are accepted.
        """
        self.cik = cik
        self.documents = []
        self.data = {}
        url_string = (
            "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK="
            + str(self.cik)
            + "&type=10-%25&dateb=&owner=exclude&start=0&count=400&output=atom"
        )
        # minidom.parse reads the whole response, so the connection can be
        # closed as soon as parsing is finished.
        with closing(urlopen(url_string)) as xml_file:
            self.xml = minidom.parse(xml_file)

    def get_company_info(self):
        """Placeholder; currently does nothing.

        Company metadata is collected into ``self.data`` by ``get_documents``.
        """
        pass

    def get_documents(self):
        """
        Crawls edgar to get a list of all 10-Q/K XBRL files

        Also copies the company metadata found in the feed into ``self.data``.
        Each feed entry that carries an ``xbrl_href`` element is wrapped in a
        ``Document`` (which performs its own network request) and appended to
        ``self.documents``; entries without XBRL data are skipped.
        """
        xml = self.xml
        mapping = {
            'name': 'conformed-name',
            'fiscal_year': 'fiscal-year-end',
            'state_location': 'state-location',
            'state_incorporation': 'state-of-incorporation',
            'sic': 'assigned-sic',
            'sic_desc': 'assigned-sic-desc',
            'cik': 'cik',
        }
        # items() rather than iteritems() so this runs on Python 2 and 3.
        for key, tag in mapping.items():
            try:
                self.data[key] = xml.getElementsByTagName(tag)[0].firstChild.nodeValue
            except (IndexError, AttributeError):
                # Tag absent (IndexError) or present but empty
                # (firstChild is None): the field is simply left out.
                pass

        for entry in xml.getElementsByTagName('entry'):
            try:
                content = entry.getElementsByTagName('content')[0]
                # Presence check only: indexing raises IndexError when the
                # filing has no XBRL link.
                content.getElementsByTagName('xbrl_href')[0]
                doc = Document(content)
            except IndexError:  # no xbrl data
                continue
            self.documents.append(doc)

    def __is_quarterly__(self, start, end):
        """Return True when *start*..*end* spans at most 180 days.

        Both arguments are ``YYYY-MM-DD`` strings. Note that the 180-day
        threshold also admits half-year periods, not only single quarters.

        Raises:
            ValueError: if either date does not match ``%Y-%m-%d``.
        """
        date_format = "%Y-%m-%d"
        delta = datetime.strptime(end, date_format) - datetime.strptime(start, date_format)
        return delta.days <= 30 * 6

    def get_series_from_id(self, xbrl_id):
        '''
        returns a dataframe with all of the series available from the xbrl_id provided

        Columns are segment names ('root' for datapoints without a segment)
        and the index is the period end date. Datapoints with a start date
        (flows) are kept only when ``__is_quarterly__`` accepts the period;
        datapoints without a start date (instants) are always kept.

        Raises:
            Exception: if no documents are available from Edgar, or if any
                document yields no datapoints for *xbrl_id*.
        '''
        if not self.documents:
            self.get_documents()
        if not self.documents:
            raise Exception("No data available from Edgar")

        processed = {}
        for document in self.documents:
            datas = document.get_item(xbrl_id)
            if len(datas) == 0:
                raise Exception("Error extracting data from documents")
            for data in datas:
                # Datapoints without a segment belong to the 'root' series.
                series_name = data['segment'] or 'root'
                series = processed.setdefault(series_name, {})
                is_instant = 'start' not in data
                if is_instant or self.__is_quarterly__(data['start'], data['end']):
                    series[data['end']] = data['value']
        return pd.DataFrame(processed)
class Document:
    '''
    Represents data about a filing

    ``self.data`` holds 'filing_date', 'filing_type', 'filing_url' (read from
    the filing's XML node) and 'xbrl_url' (discovered by fetching the filing
    page). Constructing an instance therefore performs a network request.
    '''

    def __init__(self, filing):
        '''
        filing must be an XML node representing an Edgar filing

        Raises:
            IndexError: if a required element is missing from *filing*, or
                the filing page has no "Data Files" table / link.
        '''
        self.data = {}
        fields = (
            ('filing_date', 'filing-date'),
            ('filing_type', 'filing-type'),
            ('filing_url', 'filing-href'),
        )
        for key, tag in fields:
            self.data[key] = filing.getElementsByTagName(tag)[0].firstChild.nodeValue
        self.data['xbrl_url'] = self.__get_xbrl_url__()

    def __get_xbrl_url__(self):
        '''
        Fetch the filing page and return the absolute URL of the first link
        in its "Data Files" table.

        Raises IndexError when the table or the link is absent.
        '''
        with closing(urlopen(self.data['filing_url'])) as response:
            filing = response.read()
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(filing, 'html.parser')
        xbrl_table = soup.findAll('table', attrs={'summary': "Data Files"})[0]
        return 'http://www.sec.gov' + xbrl_table.findAll('a')[0]['href']

    @staticmethod
    def _find_context(soup, contextRef):
        '''Return the first context element whose id equals contextRef, or None.'''
        for context in soup.getElementsByTagNameNS('*', 'context'):
            if context.getAttribute('id') == contextRef:
                return context
        return None

    @staticmethod
    def _period_of(context):
        '''
        Return the period of a context element as a dict: {'end': date} when
        it has an instant (stock), else {'start': date, 'end': date} (flow).
        '''
        instants = context.getElementsByTagNameNS('*', 'instant')
        if instants:  # stock
            return {'end': instants[0].firstChild.nodeValue}
        # flow
        start = context.getElementsByTagNameNS('*', 'startDate')[0].firstChild.nodeValue
        end = context.getElementsByTagNameNS('*', 'endDate')[0].firstChild.nodeValue
        return {'start': start, 'end': end}

    @staticmethod
    def _segment_of(context):
        '''Return the text of the context's first explicitMember, or None.'''
        members = context.getElementsByTagNameNS('*', 'explicitMember')
        if members and members[0].firstChild is not None:
            return members[0].firstChild.nodeValue
        return None

    def __get_context_period__(self, soup, contextRef):
        '''
        Return the period dict for the context with id contextRef in the
        parsed XBRL document soup, or None when no such context exists.
        '''
        context = self._find_context(soup, contextRef)
        if context is None:
            return None
        return self._period_of(context)

    def __get_segment__(self, soup, contextRef):
        '''
        Return the explicitMember text of the context with id contextRef,
        or None when the context or the member is missing.
        '''
        context = self._find_context(soup, contextRef)
        if context is None:
            return None
        return self._segment_of(context)

    def get_item(self, code):
        '''
        Download this filing's XBRL file and return every datapoint whose
        element local name is code.

        Each datapoint is a dict with 'end' (and 'start' for flows), 'value',
        'contextRef' and 'segment'. 'value' is None for empty elements.
        Elements without a contextRef, or whose context cannot be found in
        the document, are skipped.
        '''
        with closing(urlopen(self.data['xbrl_url'])) as xbrl_data:
            soup = minidom.parse(xbrl_data)

        # Index contexts once instead of rescanning them for every datapoint;
        # setdefault keeps the first context seen for a given id.
        contexts = {}
        for context in soup.getElementsByTagNameNS('*', 'context'):
            context_id = context.getAttribute('id')
            if context_id:
                contexts.setdefault(context_id, context)

        datapoints = []
        for point in soup.getElementsByTagNameNS('*', code):
            contextRef = point.getAttribute('contextRef')
            context = contexts.get(contextRef) if contextRef else None
            if context is None:
                continue
            datapoint = self._period_of(context)
            value_node = point.firstChild
            datapoint['value'] = value_node.nodeValue if value_node is not None else None
            datapoint['contextRef'] = contextRef
            datapoint['segment'] = self._segment_of(context)
            datapoints.append(datapoint)
        return datapoints

    def __str__(self):
        return str(self.data)
#data = Company('MSFT')
#print data.get_series_from_id('Assets')['root']