Skip to content

Commit

Permalink
feat(dev): moved output buffers into writer methods
Browse files Browse the repository at this point in the history
- Moved output buffers into the datastore writer methods.
- Made write record methods return bytes.
- Added end file method for returning final bytes of the files.
  • Loading branch information
JVickery-TBS committed Oct 13, 2023
1 parent 06b654c commit 0f3ea9a
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 43 deletions.
20 changes: 6 additions & 14 deletions ckanext/datastore/blueprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from typing import Any, Optional, cast, Union
from itertools import zip_longest
from io import BytesIO

from flask import Blueprint, Response
from flask.views import MethodView
Expand Down Expand Up @@ -207,8 +206,6 @@ def dump_to(
limit: Optional[int], options: dict[str, Any], sort: str,
search_params: dict[str, Any], user: str
):
output_buffer = BytesIO()

if fmt == 'csv':
writer_factory = csv_writer
records_format = 'csv'
Expand All @@ -226,9 +223,8 @@ def dump_to(

bom = options.get('bom', False)

def start_stream_writer(output_buffer: BytesIO,
fields: list[dict[str, Any]]):
return writer_factory(output_buffer, fields, bom=bom)
def start_stream_writer(fields: list[dict[str, Any]]):
return writer_factory(fields, bom=bom)

def stream_result_page(offs: int, lim: Union[None, int]):
return get_action('datastore_search')(
Expand All @@ -246,18 +242,14 @@ def stream_result_page(offs: int, lim: Union[None, int]):

def stream_dump(offset: int, limit: Union[None, int],
paginate_by: int, result: dict[str, Any]):
with start_stream_writer(output_buffer, result['fields']) as output:
with start_stream_writer(result['fields']) as writer:
while True:
if limit is not None and limit <= 0:
break

records = result['records']

output.write_records(records)
output_buffer.seek(0)
yield output_buffer.read()
output_buffer.truncate(0)
output_buffer.seek(0)
yield writer.write_records(records)

if records_format == 'objects' or records_format == 'lists':
if len(records) < paginate_by:
Expand All @@ -272,8 +264,8 @@ def stream_dump(offset: int, limit: Union[None, int],
break

result = stream_result_page(offset, limit)
output_buffer.seek(0)
yield output_buffer.read()

yield writer.end_file()

result = stream_result_page(offset, limit)

Expand Down
2 changes: 1 addition & 1 deletion ckanext/datastore/tests/test_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ def test_dump_xml(self, app):

res = app.get(f"/datastore/dump/{resource['id']}?limit=1&format=xml")
expected_content = (
u"<data>\n"
u'<data xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n'
r'<row _id="1">'
u"<b\xfck>annakarenina</b\xfck>"
u"<author>tolstoy</author>"
Expand Down
84 changes: 56 additions & 28 deletions ckanext/datastore/writer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# encoding: utf-8
from __future__ import annotations

from io import BytesIO
from io import StringIO, BytesIO

from contextlib import contextmanager
from typing import Any, Optional
Expand All @@ -14,113 +14,132 @@
from codecs import BOM_UTF8


BOM = "\N{bom}"


@contextmanager
def csv_writer(fields: list[dict[str, Any]], bom: bool = False):
    '''Context manager yielding a TextWriter that renders UTF-8 CSV.

    The writer owns an internal StringIO buffer; callers collect encoded
    bytes from TextWriter.write_records() and TextWriter.end_file().

    :param fields: list of datastore fields
    :param bom: True to include a UTF-8 BOM at the start of the file
    '''
    output = StringIO()

    if bom:
        output.write(BOM)

    # Header row: one column per datastore field id.
    csv.writer(output).writerow(
        f['id'] for f in fields)
    yield TextWriter(output)


@contextmanager
def tsv_writer(fields: list[dict[str, Any]], bom: bool = False):
    '''Context manager yielding a TextWriter that renders UTF-8 TSV.

    The writer owns an internal StringIO buffer; callers collect encoded
    bytes from TextWriter.write_records() and TextWriter.end_file().

    :param fields: list of datastore fields
    :param bom: True to include a UTF-8 BOM at the start of the file
    '''
    output = StringIO()

    if bom:
        output.write(BOM)

    # Header row, tab-separated (Excel TSV dialect).
    csv.writer(
        output,
        dialect='excel-tab').writerow(
            f['id'] for f in fields)
    yield TextWriter(output)


class TextWriter(object):
    'text in, text out'

    def __init__(self, output: StringIO):
        # Internal text buffer; drained and reused on every write so
        # memory does not grow with the size of the dump.
        self.output = output

    def write_records(self, records: list[Any]) -> bytes:
        '''Append pre-formatted record text and return it UTF-8 encoded.

        ``records`` is already-rendered CSV/TSV text (a str) despite the
        list annotation — see the callers' ``records_format``.
        '''
        self.output.write(records)  # type: ignore
        # Drain the whole buffer (including any BOM/header written by the
        # factory) and reset it for the next page of records.
        output = self.output.getvalue().encode('utf-8')
        self.output.seek(0)
        self.output.truncate(0)
        return output

    def end_file(self) -> bytes:
        '''CSV/TSV files need no trailer; nothing left to flush.'''
        return b''


@contextmanager
def json_writer(fields: list[dict[str, Any]], bom: bool = False):
    '''Context manager yielding a JSONWriter that renders UTF-8 JSON.

    The writer owns an internal StringIO buffer; callers collect encoded
    bytes from JSONWriter.write_records() and JSONWriter.end_file().

    :param fields: list of datastore fields
    :param bom: True to include a UTF-8 BOM at the start of the file
    '''
    output = StringIO()

    if bom:
        output.write(BOM)

    # Opening of the top-level object and the "records" array;
    # JSONWriter.end_file() emits the matching close.
    output.write(
        '{\n "fields": %s,\n "records": [' % dumps(
            fields, ensure_ascii=False, separators=(',', ':')))
    yield JSONWriter(output)


class JSONWriter(object):
    '''Streams datastore records as elements of a JSON "records" array.'''

    def __init__(self, output: StringIO):
        # Internal text buffer; drained and reused on every write so
        # memory does not grow with the size of the dump.
        self.output = output
        # True until the first record is written; controls whether a
        # separating comma precedes the next element.
        self.first = True

    def write_records(self, records: list[Any]) -> bytes:
        '''Serialize records as JSON array elements; return UTF-8 bytes.'''
        for r in records:
            if self.first:
                self.first = False
                self.output.write('\n ')
            else:
                self.output.write(',\n ')

            self.output.write(dumps(
                r, ensure_ascii=False, separators=(',', ':')))

        # Drain the whole buffer and reset it for the next page.
        output = self.output.getvalue().encode('utf-8')
        self.output.seek(0)
        self.output.truncate(0)
        return output

    def end_file(self) -> bytes:
        '''Close the "records" array and the top-level object.'''
        return b'\n]}\n'


@contextmanager
def xml_writer(fields: list[dict[str, Any]], bom: bool = False):
    '''Context manager yielding an XMLWriter that renders UTF-8 XML.

    :param fields: list of datastore fields
    :param bom: True to include a UTF-8 BOM at the start of the file
    '''
    # Bytes buffer (not StringIO): ElementTree.write() with an explicit
    # encoding emits encoded bytes directly.
    output = BytesIO()

    if bom:
        output.write(BOM_UTF8)

    # Root element open tag; XMLWriter.end_file() emits the close tag.
    output.write(
        b'<data xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n')
    yield XMLWriter(output, [f['id'] for f in fields])


class XMLWriter(object):
_key_attr = 'key'
_value_tag = 'value'

def __init__(self, output: BytesIO, columns: list[str]):
def __init__(self, output: StringIO, columns: list[str]):
self.output = output
self.id_col = columns[0] == '_id'
if self.id_col:
Expand All @@ -145,7 +164,7 @@ def _insert_node(self, root: Any, k: str, v: Any,
if key_attr is not None:
element.attrib[self._key_attr] = str(key_attr)

def write_records(self, records: list[Any]):
def write_records(self, records: list[Any]) -> bytes:
for r in records:
root = Element('row')
if self.id_col:
Expand All @@ -154,3 +173,12 @@ def write_records(self, records: list[Any]):
self._insert_node(root, c, r[c])
ElementTree(root).write(self.output, encoding='utf-8')
self.output.write(b'\n')
self.output.seek(0)
output = self.output.read()
self.output.truncate(0)
self.output.seek(0)
return output

def end_file(self) -> bytes:
return b'</data>\n'

0 comments on commit 0f3ea9a

Please sign in to comment.