Skip to content

Commit

Permalink
Move html to new file in portia server to reduce file size
Browse files Browse the repository at this point in the history
  • Loading branch information
ruairif committed Dec 14, 2016
1 parent e41002d commit 7fdf275
Show file tree
Hide file tree
Showing 11 changed files with 254 additions and 172 deletions.
32 changes: 27 additions & 5 deletions portia_server/portia_api/resources/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,6 @@ class SampleSchema(SlydSchema):
page_type = fields.Str(default='item')
scrapes = fields.Str()
extractors = fields.Dict(default={})
original_body = fields.Str(default='')
annotated_body = fields.Str(default='')
project = fields.Relationship(
related_url='/api/projects/{project_id}',
related_url_kwargs={'project_id': '<project_id>'},
Expand All @@ -212,13 +210,21 @@ class SampleSchema(SlydSchema):
'spider_id': '<spider_id>'},
type_='spiders', include_resource_linkage=True
)
html = fields.Relationship(
original_body = fields.Relationship(
related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
'{sample_id}/html',
'{sample_id}/original_body',
related_url_kwargs={'project_id': '<project_id>',
'spider_id': '<spider_id>',
'sample_id': '<id>'},
type_='html', include_resource_linkage=True
type_='html', include_resource_linkage=False
)
rendered_body = fields.Relationship(
related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
'{sample_id}/rendered_body',
related_url_kwargs={'project_id': '<project_id>',
'spider_id': '<spider_id>',
'sample_id': '<id>'},
type_='html', include_resource_linkage=False
)
items = fields.Relationship(
related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
Expand Down Expand Up @@ -370,6 +376,22 @@ class Meta:
type_ = 'html'


class RenderedBody(SlydSchema):
    """JSON API schema with a string ``id`` and the ``html`` of a body."""

    # Both attributes are serialized as plain strings.
    id = fields.Str()
    html = fields.Str()

    class Meta:
        # Resource type reported for this schema.
        type_ = 'rendered-bodys'


class OriginalBody(SlydSchema):
    """JSON API schema with a string ``id`` and the ``html`` of a body."""

    # Both attributes are serialized as plain strings.
    id = fields.Str()
    html = fields.Str()

    class Meta:
        # Resource type reported for this schema.
        type_ = 'original-bodys'


class ItemSchema(SlydSchema):
"""Instance of a schema. Meta item built from sample."""
id = fields.Str()
Expand Down
33 changes: 32 additions & 1 deletion portia_server/portia_api/resources/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from portia_orm.base import AUTO_PK
from portia_orm.exceptions import ProtectedError
from portia_orm.models import (Project, Schema, Field, Extractor, Spider,
Sample, Item, Annotation)
Sample, Item, Annotation, RenderedBody,
OriginalBody)
from portia_api.utils.projects import unique_name


Expand Down Expand Up @@ -338,3 +339,33 @@ def update(self, instance, validated_data):
clear_auto_created(new_field)

return instance


class RenderedBodySerializer(JsonApiSerializer):
    """Serializer for the ``RenderedBody`` model of a sample."""

    class Meta:
        model = RenderedBody
        # NOTE(review): the ``{self...}`` placeholders are presumably
        # formatted with the serialized instance bound to ``self`` -- confirm
        # against JsonApiSerializer.
        url = (
            '/api/projects/{self.sample.spider.project.id}'
            '/spiders/{self.sample.spider.id}'
            '/samples/{self.sample.id}'
            '/rendered_body'
        )
        links = {
            'sample': {
                # Link back to the sample that owns this body.
                'related': (
                    '/api/projects/{self.sample.spider.project.id}'
                    '/spiders/{self.sample.spider.id}'
                    '/samples/{self.sample.id}'
                ),
            },
        }

class OriginalBodySerializer(JsonApiSerializer):
    """Serializer for the ``OriginalBody`` model of a sample."""

    class Meta:
        model = OriginalBody
        # NOTE(review): the ``{self...}`` placeholders are presumably
        # formatted with the serialized instance bound to ``self`` -- confirm
        # against JsonApiSerializer.
        url = (
            '/api/projects/{self.sample.spider.project.id}'
            '/spiders/{self.sample.spider.id}'
            '/samples/{self.sample.id}'
            '/original_body'
        )
        links = {
            'sample': {
                # Link back to the sample that owns this body.
                'related': (
                    '/api/projects/{self.sample.spider.project.id}'
                    '/spiders/{self.sample.spider.id}'
                    '/samples/{self.sample.id}'
                ),
            },
        }

24 changes: 18 additions & 6 deletions portia_server/portia_orm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ def __init__(self, meta, model):
if not isinstance(self.polymorphic, (bool, string_types)):
raise ValueError(
"'polymorphic' option must be a string or boolean.")
self.raw = getattr(meta, 'raw', False)
if not isinstance(self.raw, bool):
raise ValueError("'raw' option must be a boolean")
self.single = getattr(meta, 'single', False)
if not isinstance(self.single, bool):
raise ValueError("'single' option must be a boolean")
self.ignore_if_missing = getattr(meta, 'ignore_if_missing', False)
if not isinstance(self.ignore_if_missing, bool):
raise ValueError("'ignore_if_missing' option must be a boolean")


class ModelMeta(type):
Expand Down Expand Up @@ -440,7 +449,7 @@ def _staged_model_references(self, load_relationships=False):
else:
value = self.data_store.get(
name, ('staged', 'committed'))
except (AttributeError, KeyError):
except (AttributeError, KeyError, PathResolutionError):
continue
if value is None:
continue
Expand Down Expand Up @@ -488,7 +497,7 @@ def _commit_delete(self, collector, saved_paths=None, deleted_paths=None):
for model in collector.delete:
path = model.storage_path(model, snapshots=('committed',))
if model.opts.owner:
if path not in saved_paths and path not in deleted_paths:
if path and path not in saved_paths and path not in deleted_paths:
to_save = self._get_object_to_dump(
model, parent_snapshots=('committed',))
model.storage.save(path, ContentFile(
Expand Down Expand Up @@ -518,7 +527,7 @@ def load(cls, storage, instance=None, **kwargs):
if not path:
return

many = bool(cls.opts.owner)
many = bool(cls.opts.owner) and not cls.opts.single
if instance and many:
try:
instance.data_store.get(instance._pk_field)
Expand All @@ -534,8 +543,9 @@ def load(cls, storage, instance=None, **kwargs):
return cls.collection()
return instance # may be None

file_data = json.loads(storage.open(path).read(),
object_pairs_hook=OrderedDict)
file_data = storage.open(path).read()
if not cls.opts.raw:
file_data = json.loads(file_data, object_pairs_hook=OrderedDict)

if cls.opts.polymorphic:
if not many:
Expand All @@ -555,7 +565,7 @@ def load(cls, storage, instance=None, **kwargs):

file_schema = cls._file_model.file_schema
result = file_schema(
context={'storage': storage}).load(
context={'storage': storage, 'path': path}).load(
file_data, many=many).data
return result

Expand All @@ -568,6 +578,8 @@ def storage_path(cls, data, snapshots=None):
try:
path = (cls.opts.path or u'').format(self=data)
except AttributeError as e:
if cls.opts.ignore_if_missing:
return
raise PathResolutionError(
u"Could not resolve file path for model '{}':\n"
u" {}".format(cls.__name__, e))
Expand Down
3 changes: 2 additions & 1 deletion portia_server/portia_orm/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .collection import ListDescriptor
from .deletion import CASCADE, CLEAR, PROTECT
from .exceptions import ImproperlyConfigured, ValidationError
from .relationships import BelongsTo, HasMany
from .relationships import BelongsTo, HasMany, HasOne
from .validators import OneOf

__all__ = [
Expand All @@ -21,6 +21,7 @@
'Url',
'BelongsTo',
'HasMany',
'HasOne',
'StartUrl',
'CASCADE',
'CLEAR',
Expand Down
75 changes: 65 additions & 10 deletions portia_server/portia_orm/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import six

from collections import deque, OrderedDict

Expand All @@ -13,13 +14,15 @@
load_annotations)
from slybot.starturls import StartUrlCollection

from storage.backends import ContentFile

from .base import Model
from .decorators import pre_load, post_dump
from .decorators import pre_load, post_dump, post_load
from .exceptions import PathResolutionError
from .fields import (
Boolean, Domain, Integer, List, Regexp, String, Url, DependantField,
BelongsTo, HasMany, CASCADE, CLEAR, PROTECT, StartUrl)
from .utils import unwrap_envelopes, short_guid, wrap_envelopes
BelongsTo, HasMany, HasOne, CASCADE, CLEAR, PROTECT, StartUrl)
from .utils import unwrap_envelopes, short_guid, wrap_envelopes, encode
from .validators import OneOf

FIELD_TYPES = FieldTypeManager().available_type_names()
Expand Down Expand Up @@ -317,14 +320,13 @@ class Sample(Model, OrderedAnnotationsMixin):
url = Url(required=True)
page_id = String(default='')
page_type = String(default='item', validate=OneOf(['item']))
original_body = String(default='')
annotated_body = String(default='')
rendered_body = String(default='')
body = String(default='original_body',
validate=OneOf(['original_body', 'rendered_body']))
spider = BelongsTo(Spider, related_name='samples', on_delete=CASCADE,
only='id')
items = HasMany('Item', related_name='sample', on_delete=CLEAR)
original_body = HasOne('OriginalBody', related_name='sample',
on_delete=CLEAR, ignore_in_file=True)
rendered_body = HasOne('RenderedBody', related_name='sample',
on_delete=CLEAR, ignore_in_file=True)

class Meta:
path = u'spiders/{self.spider.id}/{self.id}.json'
Expand Down Expand Up @@ -358,7 +360,6 @@ def chain_load(self, data):

@staticmethod
def migrate_sample(self, data):
data['body'] = data.get('body') or 'original_body'
if not data.get('name'):
data['name'] = data.get('id', data.get('page_id', u'')[:20])
if data.get('version', '') >= '0.13.1':
Expand Down Expand Up @@ -449,6 +450,19 @@ def _add_schemas(serializer, schemas):
schema_collection.add(model)
project.schemas = schema_collection

@post_load
def _migrate_html(self, sample):
    """Write the ``*_body`` HTML of a loaded sample to separate files.

    For every key of ``sample`` that ends in ``_body``, the ``html``
    attribute of its value is UTF-8 encoded and saved through the storage in
    the schema context at ``<context path without '.json'>/<key>.html``.

    :param sample: mapping of loaded sample attributes.
    """
    bodies = [(key, value) for key, value in sample.items()
              if key.endswith('_body') and value is not None]
    if not bodies:
        # Nothing to migrate, so the context is never consulted.
        return

    # Every body of a sample lands in the same directory: derive it once
    # instead of on each iteration.
    directory = self.context['path'][:-len('.json')].strip('/')
    storage = self.context['storage']
    for key, value in bodies:
        path = '/'.join((directory, '{}.html'.format(key)))
        # The encoded value is stored directly. The earlier text-type
        # re-check ran after encoding and could therefore never be true.
        html = value.html.encode('utf-8')
        storage.save(path, ContentFile(html, path))

@post_dump
def add_fields(self, data):
items = data.pop('items', [])
Expand Down Expand Up @@ -481,7 +495,6 @@ def add_fields(self, data):
scrapes = annotation.get('schema_id')
if scrapes:
break

data.update({
'extractors': data.get('extractors', {}),
'plugins': {
Expand Down Expand Up @@ -721,3 +734,45 @@ def set_annotation_data(self, data):
('tagid', None),
('xpath', data['xpath']),
])


class OriginalBody(Model):
    # HTML of a sample, kept in its own ``original_body.html`` file.
    id = String(primary_key=True)
    html = String(default='')
    sample = BelongsTo(Sample, related_name='original_body',
                       on_delete=CASCADE, ignore_in_file=True)

    class Meta:
        owner = 'sample'
        # ``raw``: the file content is not JSON-decoded when loaded.
        raw = True
        # ``single``: load one object even though the model has an owner.
        single = True
        path = (u'spiders/{self.sample.spider.id}/{self.sample.id}/'
                u'original_body.html')
        name = 'original_body'

    @pre_load
    def populate_item(self, data):
        """Wrap raw file content in a dict with an id derived from the path.

        The third component of the context path is taken as the sample id;
        a ``.json`` suffix is dropped when the path has exactly three parts.
        """
        suffix = '.json'
        path_parts = self.context['path'].split('/')
        sample_id = path_parts[2]
        is_sample_file = len(path_parts) == 3 and sample_id.endswith(suffix)
        if is_sample_file:
            sample_id = sample_id[:-len(suffix)]
        name = self.Meta.name
        body_id = '{}_{}'.format(sample_id, name)
        return {'id': body_id, 'html': data}

    @post_dump
    def return_html(self, data):
        """Dump only the HTML string rather than the whole mapping."""
        return data['html']


class RenderedBody(OriginalBody):
    # Same fields and hooks as OriginalBody, stored in ``rendered_body.html``
    # and reachable from the sample as ``rendered_body``.
    sample = BelongsTo(Sample, related_name='rendered_body',
                       on_delete=CASCADE, ignore_in_file=True)

    # NOTE(review): this Meta does not subclass OriginalBody.Meta, so
    # ``owner``/``raw``/``single`` are only in effect if the model metaclass
    # merges parent options -- confirm.
    class Meta:
        # An unresolvable path yields no path instead of raising
        # PathResolutionError.
        ignore_if_missing = True
        path = (u'spiders/{self.sample.spider.id}/{self.sample.id}/'
                u'rendered_body.html')
        name = 'rendered_body'
20 changes: 20 additions & 0 deletions portia_server/portia_orm/relationships.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
__all__ = [
'BelongsTo',
'HasMany',
'HasOne'
]


Expand Down Expand Up @@ -89,6 +90,21 @@ def replace_collection(self, collection, values):
collection.update(values)


class HasOneDescriptor(BelongsToDescriptor, BaseRelationshipDescriptor):
    """Descriptor that lazily loads a single related model from storage."""

    def __get__(self, instance, instance_type=None):
        """Return the related object, loading it on first access.

        When the instance holds no value for this attribute (lookup raises
        ``AttributeError`` or yields ``None``), the related model is loaded
        from the instance's storage, linked back to ``instance`` and cached
        in the instance's data store under the ``'committed'`` snapshot.
        """
        try:
            field = instance.get_data(self.attrname)
        except AttributeError:
            field = None

        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O``, which would skip the lazy load and return None.
        if field is None:
            field = self.model.load(instance.storage, **{
                self.related_name: instance
            })
            if field and not getattr(field, self.related_name, None):
                # Make sure the loaded object points back at its owner.
                setattr(field, self.related_name, instance)
            instance.data_store.set(self.attrname, field, 'committed')
        return field


class BaseRelationship(fields.Nested):
descriptor_class = None

Expand Down Expand Up @@ -234,3 +250,7 @@ class HasMany(BaseRelationship):
def __init__(self, *args, **kwargs):
kwargs['many'] = True
super(HasMany, self).__init__(*args, **kwargs)


class HasOne(BaseRelationship):
    """Relationship field whose attribute access uses HasOneDescriptor."""

    descriptor_class = HasOneDescriptor
Loading

0 comments on commit 7fdf275

Please sign in to comment.