Skip to content

Commit

Permalink
Moved over to more efficient csv export. Instead of holding all the d…
Browse files Browse the repository at this point in the history
…ata in memory, we iterate through each survey writing one line at a time to the csv file.
  • Loading branch information
amarder committed May 1, 2011
1 parent 3119013 commit 3065b5b
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 189 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,7 @@ search_index
ve/

# file created by tests
registration.xml
registration.xml

# folder to hold csv files
csvs
4 changes: 2 additions & 2 deletions base_templates/dashboard.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
</td>
<td>
{% if xform.submission_count %}
<a href={% url parsed_xforms.views.xls id_string=xform.id_string %}>xls</a>
<a href={% url parsed_xforms.views.csv_export id_string=xform.id_string %}>csv</a>
{% endif %}
</td>
</tr>
Expand Down Expand Up @@ -1880,4 +1880,4 @@ <h2>

</script>

{% endblock %}
{% endblock %}
138 changes: 65 additions & 73 deletions parsed_xforms/models/data_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,28 @@ def get_element(self, abbreviated_xpath):
self._survey_elements = {}
for e in self.get_survey_elements():
self._survey_elements[e.get_abbreviated_xpath()] = e
return self._survey_elements.get(abbreviated_xpath)
def remove_all_indices(xpath):
return re.sub(r"\[\d+\]", u"", xpath)
clean_xpath = remove_all_indices(abbreviated_xpath)
return self._survey_elements.get(clean_xpath)

def get_label(self, abbreviated_xpath):
    """
    Return the label of the survey element found at abbreviated_xpath.

    Returns None when get_element finds no (truthy) element for the
    given xpath.
    """
    # todo: think about multiple language support
    element = self.get_element(abbreviated_xpath)
    if not element:
        return None
    return element.get_label()

def remove_from_spreadsheet(self, abbreviated_xpath):
def _remove_unwanted_keys(self, d):
# we will remove respondents 4 and above
def respondent_index_above_two(abbreviated_xpath):
def respondent_index_above_three(abbreviated_xpath):
m = re.search(r"^respondent\[(\d+)\]/", abbreviated_xpath)
if m:
return int(m.group(1)) > 3
return False
if respondent_index_above_two(abbreviated_xpath): return True
e = self.get_element(abbreviated_xpath)
if e is None: return False
if e.get_bind().get(u"readonly")==u"true()":
return True
return False
for k in d.keys():
if respondent_index_above_three(k): del d[k]
e = self.get_element(k)
if e is None: continue
if e.get_bind().get(u"readonly")==u"true()": del d[k]

def get_xpath_cmp(self):
if not hasattr(self, "_xpaths"):
Expand Down Expand Up @@ -102,12 +104,10 @@ def get_variable_name(self, abbreviated_xpath):
def get_parsed_instances_from_mongo(self):
id_string = self.xform.id_string
match_id_string = {XFORM_ID_STRING : id_string}
parsed_instances = \
xform_instances.find(spec=match_id_string)
return list(parsed_instances)
return xform_instances.find(spec=match_id_string)

def _rename_key(self, is_key_to_rename, new_key, data):
for d in data:
def _rename_state_and_lga_keys(self, d):
def rename_key(is_key_to_rename, new_key):
candidates = [k for k in d.keys() if is_key_to_rename(k)]
for k in candidates:
if d[k] is None:
Expand All @@ -120,39 +120,43 @@ def _rename_key(self, is_key_to_rename, new_key, data):
assert new_key not in d
d[new_key] = d[candidates[0]]
del d[candidates[0]]

def _collapse_other_into_select_one(self, data):
for d in data:
candidates = [k for k in d.keys() if k.endswith(u"_other")]
for other_key in candidates:
root_key = other_key[:-len(u"_other")]
e = self.get_element(root_key)
if e.get_bind().get(u"type")==u"select1":
if d[root_key]==u"other":
d[root_key] = d[other_key]
del d[other_key]

def _expand_select_all_that_apply(self, data):
def remove_all_indices(xpath):
return re.sub(r"\[\d+\]", u"", xpath)
for d in data:
for key in d.keys():
e = self.get_element(remove_all_indices(key))
if e and e.get_bind().get(u"type")==u"select":
options_selected = None if d[key] is None else d[key].split()
for i, child in enumerate(e.get_children()):
new_key = key + u"[%s]" % i
if options_selected is None:
d[new_key] = u"n/a"
elif child.get_name() in options_selected:
assert new_key not in d
d[new_key] = True
if child.get_name()==u"other":
d[new_key] = d[key + u"_other"]
del d[key + u"_other"]
else:
d[new_key] = False
del d[key]
renamer = {
u"state" : lambda x: x.startswith(u"location/state_in_"),
u"lga" : lambda x: x.startswith(u"location/lga_in_"),
}
for k, v in renamer.items(): rename_key(v, k)

def _collapse_other_into_select_one(self, d):
"""
Fold "<question>_other" free-text answers into their select1 question.

For every key of d that ends in "_other", the key without that suffix
(the root key) is looked up with get_element. When that element's bind
type is "select1" and the answer stored under the root key is "other",
the free-text value replaces the answer. d is modified in place.
"""
# collect the matching keys up front because d is mutated in the loop
candidates = [k for k in d.keys() if k.endswith(u"_other")]
for other_key in candidates:
root_key = other_key[:-len(u"_other")]
# NOTE(review): get_element looks the xpath up with dict.get, so e
# may be None here -- confirm every *_other key has a root element
e = self.get_element(root_key)
if e.get_bind().get(u"type")==u"select1":
if d[root_key]==u"other":
d[root_key] = d[other_key]
# NOTE(review): indentation is lost in this view, so it is unclear
# whether this delete runs only when "other" was chosen or for every
# select1 question -- confirm against the repository
del d[other_key]

def _expand_select_all_that_apply(self, d):
"""
Replace each select-all-that-apply answer in d with one entry per option.

For every key whose element has bind type "select", the answer is split
on whitespace and a new key "<key>[<i>]" is added for each child i of
the element:
  * u"n/a" when the answer is None,
  * 1 when the child's name is among the selected options (or the value
    of "<key>_other" when that child is named "other"),
  * 0 otherwise.
The original key is deleted afterwards. d is modified in place.
"""
# NOTE(review): keys are added to and removed from d inside this loop;
# that appears to rely on keys() returning a list (Python 2) -- confirm
# before porting
for key in d.keys():
e = self.get_element(key)
if e and e.get_bind().get(u"type")==u"select":
# a None answer turns every option entry into u"n/a" below
options_selected = None if d[key] is None else d[key].split()
for i, child in enumerate(e.get_children()):
# this is a hack to get things ordered correctly
# this needs to coordinate with the get variable
# name method.
new_key = key + u"[%s]" % i
if options_selected is None:
d[new_key] = u"n/a"
elif child.get_name() in options_selected:
assert new_key not in d
d[new_key] = 1
if child.get_name()==u"other":
# the free-text answer replaces the 1 and its own key is dropped
d[new_key] = d[key + u"_other"]
del d[key + u"_other"]
else:
d[new_key] = 0
del d[key]

def _rename_select_all_option_key(self, hacky_name):
"""
Expand All @@ -169,31 +173,19 @@ def _rename_select_all_option_key(self, hacky_name):
u"_" + child.get_name()
return None

def _remove_index_from_first_instance_of_repeat(self, data):
for d in data:
candidates = [k for k in d.keys() if u"[1]" in k]
for key in candidates:
new_key = re.sub(r"\[1\]", "", key)
assert new_key not in d
d[new_key] = d[key]
del d[key]
def _remove_index_from_first_instance_of_repeat(self, d):
    """
    Strip every "[1]" from the keys of d, modifying d in place.

    Each key containing "[1]" is re-inserted under the same name with all
    "[1]" occurrences removed. The un-indexed name must not already be
    present in d (checked with an assert).
    """
    first_index = re.compile(r"\[1\]")
    # snapshot the affected keys before d starts changing
    indexed_keys = [k for k in d.keys() if u"[1]" in k]
    for indexed_key in indexed_keys:
        plain_key = first_index.sub("", indexed_key)
        assert plain_key not in d
        d[plain_key] = d.pop(indexed_key)

def get_data_for_excel(self):
result = self.get_parsed_instances_from_mongo()
self._collapse_other_into_select_one(result)
self._remove_index_from_first_instance_of_repeat(result)
self._rename_key(lambda x: x.startswith(u"location/state_in_"), u"state", result)
self._rename_key(lambda x: x.startswith(u"location/lga_in_"), u"lga", result)
self._expand_select_all_that_apply(result)
return result

def get_column_keys_for_excel(self):
def unique_keys(data):
s = set()
for d in data:
for k in d.keys():
s.add(k)
return list(s)
result = unique_keys(self.get_data_for_excel())
result.sort(cmp=self.get_column_key_cmp())
return [key for key in result if not self.remove_from_spreadsheet(key)]
for d in self.get_parsed_instances_from_mongo():
self._collapse_other_into_select_one(d)
self._remove_index_from_first_instance_of_repeat(d)
self._rename_state_and_lga_keys(d)
self._expand_select_all_that_apply(d)
self._remove_unwanted_keys(d)
yield d
2 changes: 1 addition & 1 deletion parsed_xforms/templates/export_list.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
</td>
<td>
{% if xform.submission_count %}
<a href={% url parsed_xforms.views.xls id_string=xform.id_string %}>xls</a>
<a href={% url parsed_xforms.views.csv_export id_string=xform.id_string %}>csv</a>
{% endif %}
</td>
</tr>
Expand Down
2 changes: 1 addition & 1 deletion parsed_xforms/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

urlpatterns = patterns('',
url(r"^survey-list/?$", views.export_list),
url(r"^export_spreadsheet/(?P<id_string>[^/]*)\.xls$", views.xls),
url(r"^export_spreadsheet/(?P<id_string>[^/]*)\.csv$", views.csv_export),
url(r"^map_data_points/(?P<lga_id>\d+)/$", views.map_data_points),
url(r"^survey/(?P<pk>\d+)/$", views.survey_responses),
url(r"^survey_image_urls/(?P<pk>\d+)/$", views.survey_media_files),
Expand Down
2 changes: 1 addition & 1 deletion parsed_xforms/views/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from old_views import *
from xls_export import xls
from csv_export import csv_export
from map_json import map_data_points
from single_survey_submission import survey_responses, survey_media_files
from median_survey_lengths import median_survey_lengths
Expand Down
134 changes: 134 additions & 0 deletions parsed_xforms/views/csv_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import codecs
import os
import re

class CsvWriter(object):
    """
    Write an iterable of dicts to a UTF-8 encoded csv file, one row per dict.

    Rows are assembled by hand rather than with csv.writer because the
    Python 2 csv package doesn't work with unicode.

    The writer is configured with:
      * a generator function returning a fresh iterator of dicts
        (set_generator_function),
      * an optional cmp-style comparator used to order the columns
        (set_key_comparator), and
      * an optional function turning each key into its column header
        (set_key_rename_function).

    The data is iterated twice -- once to collect the union of all keys
    and once to write the rows -- so the writer itself never holds more
    than one row at a time.
    """

    # unicode on Python 2, str on Python 3
    try:
        _text_type = unicode
    except NameError:
        _text_type = str

    # runs of whitespace (including newlines) are collapsed to one space
    _WHITESPACE = re.compile(u"\\s+")

    def __init__(self):
        self._generator_function = None
        self._dict_iterator = []
        self._keys = []
        # None means natural ordering, i.e. what the builtin cmp produces
        self._key_comparator = None
        self._key_rename_function = lambda x: x
        self._file_object = None

    def set_generator_function(self, generator_function):
        """
        Unfortunately there doesn't seem to be a way to rewind a
        generator, so instead of just passing a generator as my
        dict_iterator, I'm passing the generator function so we can
        get a new generator after we run through the first.

        Setting the generator function immediately runs through the
        data once to build the list of column keys.
        """
        self._generator_function = generator_function
        self._reset_dict_iterator()
        self._create_list_of_keys()

    def _reset_dict_iterator(self):
        """Point the row iterator at a fresh run of the generator function."""
        if self._generator_function is None:
            self._dict_iterator = []
        else:
            self._dict_iterator = self._generator_function()

    def _create_list_of_keys(self):
        """Collect the union of the keys of every dict, then rewind."""
        key_set = set()
        for d in self._dict_iterator:
            key_set.update(d)
        self._keys = list(key_set)
        self._reset_dict_iterator()

    def set_key_comparator(self, key_comparator):
        """Set the cmp-style function (a, b) -> int used to order columns."""
        self._key_comparator = key_comparator

    def _sort_keys(self):
        """Sort the column keys in place with the configured comparator."""
        if self._key_comparator is None:
            self._keys.sort()
        else:
            # cmp_to_key (Python 2.7+) replaces the Python-2-only cmp= argument
            from functools import cmp_to_key
            self._keys.sort(key=cmp_to_key(self._key_comparator))

    def set_key_rename_function(self, key_rename_function):
        """Set the function that maps each key to its header cell."""
        self._key_rename_function = key_rename_function

    def _ensure_directory_exists(self, path):
        """
        Create the directory that will contain path if it is missing.

        Raises OSError when the directory cannot be created.
        """
        directory = os.path.dirname(path)
        if not directory:
            # bare file name: it lives in the current working directory
            return
        try:
            os.makedirs(directory)
        except OSError:
            # an already existing directory is fine, anything else is not
            if not os.path.isdir(directory):
                raise

    def write_to_file(self, path):
        """
        Write the header row followed by one row per dict to path.

        Keys missing from a dict are written as "n/a". The file is
        closed and the row iterator rewound even if writing fails.
        """
        self._ensure_directory_exists(path)
        self._sort_keys()
        headers = [self._key_rename_function(k) for k in self._keys]

        self._file_object = codecs.open(path, mode="w", encoding="utf-8")
        try:
            self._write_row(headers)
            for d in self._dict_iterator:
                # todo: figure out how to use csv.writer with unicode
                self._write_row([d.get(k, u"n/a") for k in self._keys])
        finally:
            self._file_object.close()
            self._reset_dict_iterator()

    def _write_row(self, row):
        """
        Write one csv line to the open file.

        Every cell is converted to text and its whitespace runs are
        collapsed to a single space. Cells containing a comma or a
        double quote are quoted, with embedded quotes doubled.
        """
        cells = []
        for cell in row:
            cell_string = self._WHITESPACE.sub(u" ", self._text_type(cell))
            if u"," in cell_string or u'"' in cell_string:
                cell_string = u'"%s"' % cell_string.replace(u'"', u'""')
            cells.append(cell_string)
        self._file_object.write(u",".join(cells) + u"\n")


from parsed_xforms.models import DataDictionary

class DataDictionaryWriter(CsvWriter):
    """
    CsvWriter whose rows, column order and column headers are all
    supplied by a data dictionary.
    """

    def __init__(self):
        super(DataDictionaryWriter, self).__init__()
        self._data_dictionary = None

    def set_data_dictionary(self, data_dictionary):
        """
        Remember data_dictionary and configure the writer from it.

        Rows come from data_dictionary.get_data_for_excel, columns are
        ordered with the comparator returned by
        data_dictionary.get_column_key_cmp(), and headers are produced
        by data_dictionary.get_variable_name.
        """
        self._data_dictionary = data_dictionary
        self.set_generator_function(data_dictionary.get_data_for_excel)
        self.set_key_comparator(data_dictionary.get_column_key_cmp())
        self.set_key_rename_function(data_dictionary.get_variable_name)

# http://djangosnippets.org/snippets/365/
from django.http import HttpResponse
from django.core.servers.basehttp import FileWrapper

def send_file(path, content_type):
    """
    Send a file through Django without loading the whole file into
    memory at once. The FileWrapper will turn the file object into an
    iterator for chunks of 8KB.

    path is the file to send and content_type the value of the
    response's Content-Type. Returns the HttpResponse, with
    Content-Length set to the size of the file on disk.
    """
    # Binary mode: the bytes streamed must be exactly the bytes counted
    # by os.path.getsize, with no newline translation in between.
    wrapper = FileWrapper(open(path, "rb"))
    response = HttpResponse(wrapper, content_type=content_type)
    response['Content-Length'] = os.path.getsize(path)
    return response

from deny_if_unauthorized import deny_if_unauthorized

@deny_if_unauthorized()
def csv_export(request, id_string):
    """
    Export every submission of the form identified by id_string as csv.

    The csv is (re)written to csvs/<id_string>.csv and then streamed
    back to the client. Responds with a 404 when no data dictionary
    exists for id_string.
    """
    from django.shortcuts import get_object_or_404

    # an unknown id_string is a client error, not a server error
    dd = get_object_or_404(DataDictionary, xform__id_string=id_string)
    ddw = DataDictionaryWriter()
    ddw.set_data_dictionary(dd)
    file_path = os.path.join("csvs", id_string + ".csv")
    ddw.write_to_file(file_path)
    return send_file(path=file_path, content_type="application/csv")
Loading

0 comments on commit 3065b5b

Please sign in to comment.