Skip to content

Commit

Permalink
POST request handling and indexing improvements (webrecorder#636)
Browse files Browse the repository at this point in the history
* post append improvements:
- parse json primitives for post query
- for text/plain, attempt to parse as json, then as binary
- standardize post append indexing
- include '__wb_method' in urlkey
- add 'requestBody' and 'method' to cdxj
- give duplicate param names unique keys in json-to-query conversion (e.g. a second 'a' becomes 'a.2_')

* test fixes:
- update tests for test_inputreq
- update post-test.cdxj and post-test.cdx

* ci: fixes
- tox: run full test suite!
- disable appveyor

* inputrequest buffering fix:
- never truncate reading of the POST request: the entire POST body must be read to avoid a hung request in live mode
- truncate the final query string to a length of 4096 (MAX_QUERY_LENGTH)
  • Loading branch information
ikreymer authored Apr 28, 2021
1 parent 106a9e9 commit 626da99
Show file tree
Hide file tree
Showing 12 changed files with 116 additions and 55 deletions.
File renamed without changes.
3 changes: 3 additions & 0 deletions pywb/indexer/archiveindexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ def merge_request_data(self, other, options):
self['urlkey'] = canonicalize(new_url, surt_ordered)
other['urlkey'] = self['urlkey']

self['method'] = post_query.method
self['requestBody'] = post_query.query

referer = other.record.http_headers.get_header('referer')
if referer:
self['_referer'] = referer
Expand Down
26 changes: 13 additions & 13 deletions pywb/indexer/test/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@
# post append
>>> print_cdx_index('post-test.warc.gz', append_post=True)
CDX N b a m s k r M S V g
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
# no post append, requests included
>>> print_cdx_index('post-test.warc.gz', include_all=True)
Expand All @@ -118,12 +118,12 @@
# post append + requests included
>>> print_cdx_index('post-test.warc.gz', include_all=True, append_post=True)
CDX N b a m s k r M S V g
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 720 post-test.warc.gz
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/x-www-form-urlencoded - - - - 476 1919 post-test.warc.gz
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/x-www-form-urlencoded - - - - 475 3118 post-test.warc.gz
# post append + minimal = error
>>> print_cdx_index('example.arc.gz', append_post=True, minimal=True)
Expand Down Expand Up @@ -509,8 +509,8 @@ def test_multipart_form():
print(buff.getvalue())
assert buff.getvalue() == b"""\
CDX N b a m s k r M S V g
com,example)/ajax/bz?foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar unk text/html; 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 420 0 test.warc.gz
com,example)/ajax/bz?foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar multipart/form-data - - - - 701 428 test.warc.gz
com,example)/ajax/bz?__wb_method=post&foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar unk text/html; 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 420 0 test.warc.gz
com,example)/ajax/bz?__wb_method=post&foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar multipart/form-data - - - - 701 428 test.warc.gz
"""


Expand Down Expand Up @@ -556,8 +556,8 @@ def test_multipart_form_no_boundary():
write_cdx_index(buff, test_record, 'test.warc.gz', **options)
assert buff.getvalue() == b"""\
CDX N b a m s k r M S V g
com,connatix,capi)/core/story?__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 unk multipart/form-data SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5 - - 453 0 test.warc.gz
com,connatix,capi)/core/story?__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 multipart/form-data - - - - 500 461 test.warc.gz
com,connatix,capi)/core/story?__wb_method=post&__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 unk multipart/form-data SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5 - - 453 0 test.warc.gz
com,connatix,capi)/core/story?__wb_method=post&__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 multipart/form-data - - - - 500 461 test.warc.gz
"""


Expand Down
7 changes: 6 additions & 1 deletion pywb/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -410,11 +410,16 @@ rules:
- action_load_comments
- filter

- url_prefix: ['com,youtube)/youtubei', 'com,youtube-nocookie)/youtubei']
- url_prefix: ['com,youtube)/embed', 'com,youtube-nocookie)/embed']

fuzzy_lookup:
match: '()'

- url_prefix: ['com,youtube)/youtubei/v1', 'com,youtube-nocookie)/youtubei/v1']

fuzzy_lookup:
- videoid

- url_prefix: 'com,googlevideo,'

fuzzy_lookup:
Expand Down
2 changes: 1 addition & 1 deletion pywb/static/wombat.js

Large diffs are not rendered by default.

71 changes: 55 additions & 16 deletions pywb/warcserver/inputrequest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import base64
import cgi
import json


#=============================================================================
Expand Down Expand Up @@ -77,7 +78,7 @@ def include_method_query(self, url):

method = self.get_req_method()

if method not in ('OPTIONS', 'POST'):
if method == 'GET' or method == 'HEAD':
return url

mime = self._get_content_type()
Expand Down Expand Up @@ -181,7 +182,8 @@ def _get_header(self, name):

# ============================================================================
class MethodQueryCanonicalizer(object):
MAX_POST_SIZE = 16384
#MAX_POST_SIZE = 16384
MAX_QUERY_LENGTH = 4096

def __init__(self, method, mime, length, stream,
buffered_stream=None,
Expand All @@ -196,12 +198,9 @@ def __init__(self, method, mime, length, stream,
self.query = b''

method = method.upper()
self.method = method

if method in ('OPTIONS', 'HEAD'):
self.query = '__pywb_method=' + method.lower()
return

if method != 'POST':
if method != 'POST' and method != 'PUT':
return

try:
Expand All @@ -212,8 +211,8 @@ def __init__(self, method, mime, length, stream,
if length <= 0:
return

# max POST query allowed, for size considerations, only read upto this size
length = min(length, self.MAX_POST_SIZE)
# always read entire POST request, but limit query string later
#length = min(length, self.MAX_POST_SIZE)
query = []

while length > 0:
Expand Down Expand Up @@ -274,12 +273,26 @@ def handle_binary(query):
elif mime.startswith('application/x-amf'):
query = self.amf_parse(query, environ)

elif mime.startswith('application/json'):
try:
query = self.json_parse(query)
except Exception as e:
print(e)
query = ''

elif mime.startswith('text/plain'):
try:
query = self.json_parse(query)
except Exception as e:
query = handle_binary(query)

else:
query = handle_binary(query)

self.query = query
if query:
self.query = query[:self.MAX_QUERY_LENGTH]

def amf_parse(self, string, environ):
def amf_parse(self, string, warn_on_error):
try:
res = decode(BytesIO(string))
return urlencode({"request": Amf.get_representation(res)})
Expand All @@ -290,15 +303,41 @@ def amf_parse(self, string, environ):
print(e)
return None

def json_parse(self, string):
    """Flatten a JSON object into a urlencoded query string.

    Nested objects are walked recursively and their leaf entries are
    merged into a single flat mapping. Every non-dict value (including
    lists) is converted with ``str()``. When a key name has already been
    used, the repeat is stored under ``<name>.<count>_`` where the count
    starts at 2, e.g. ``{"a": "b", "c": {"a": 2}}`` gives ``a=b&a.2_=2``.

    :param string: JSON text (anything ``json.loads`` accepts)
    :return: the flattened data, encoded with ``urlencode``
    :raises ValueError: if the text is not valid JSON, or if the
        top-level JSON value is not an object
    """
    data = {}
    dupes = {}

    def get_key(name):
        # first occurrence keeps the plain name
        if name not in data:
            return name

        # repeats are numbered from 2: "a.2_", "a.3_", ...
        dupes[name] = dupes.get(name, 1) + 1
        return name + "." + str(dupes[name]) + "_"

    def _parser(dict_var):
        for name, value in dict_var.items():
            if isinstance(value, dict):
                _parser(value)
            else:
                data[get_key(name)] = str(value)

    parsed = json.loads(string)

    # Only objects can be flattened into key=value pairs; fail explicitly
    # instead of relying on an incidental AttributeError from .items().
    if not isinstance(parsed, dict):
        raise ValueError('JSON post data is not an object: ' +
                         type(parsed).__name__)

    _parser(parsed)
    return urlencode(data)

def append_query(self, url):
if not self.query:
if self.method == 'GET':
return url

if '?' not in url:
url += '?'
append_str = '?'
else:
url += '&'
append_str = '&'

url += self.query
return url
append_str += "__wb_method=" + self.method
if self.query:
append_str += '&' + self.query

return url + append_str
36 changes: 22 additions & 14 deletions pywb/warcserver/test/test_inputreq.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,74 +89,82 @@ def test_post_extract_1(self):
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
len(self.post_data), BytesIO(self.post_data))

assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz'

assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&foo=bar&dir=/baz'
assert mq.append_query('http://example.com/?123=ABC') == 'http://example.com/?123=ABC&__wb_method=POST&foo=bar&dir=/baz'

def test_post_extract_wrong_method(self):
def test_post_extract_json(self):
    """A JSON POST body is flattened into the query; a repeated key gets a '.2_' suffix."""
    body = b'{"a": "b", "c": {"a": 2}, "d": "e"}'
    canonicalizer = MethodQueryCanonicalizer(
        'POST', 'application/json', len(body), BytesIO(body))

    expected = 'http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e'
    assert canonicalizer.append_query('http://example.com/') == expected


def test_put_extract_method(self):
mq = MethodQueryCanonicalizer('PUT', 'application/x-www-form-urlencoded',
len(self.post_data), BytesIO(self.post_data))

assert mq.append_query('http://example.com/') == 'http://example.com/'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=PUT&foo=bar&dir=/baz'

def test_post_extract_non_form_data_1(self):
mq = MethodQueryCanonicalizer('POST', 'application/octet-stream',
len(self.post_data), BytesIO(self.post_data))

#base64 encoded data
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'

def test_post_extract_non_form_data_2(self):
mq = MethodQueryCanonicalizer('POST', 'text/plain',
len(self.post_data), BytesIO(self.post_data))

#base64 encoded data
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
assert mq.append_query('http://example.com/pathbar?id=123') == 'http://example.com/pathbar?id=123&__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'

def test_post_extract_length_invalid_ignore(self):
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
0, BytesIO(self.post_data))

assert mq.append_query('http://example.com/') == 'http://example.com/'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST'

mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
'abc', BytesIO(self.post_data))

assert mq.append_query('http://example.com/') == 'http://example.com/'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST'

def test_post_extract_length_too_short(self):
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
len(self.post_data) - 4, BytesIO(self.post_data))

assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=%2'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=%2'

def test_post_extract_length_too_long(self):
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
len(self.post_data) + 4, BytesIO(self.post_data))

assert mq.append_query('http://example.com/') == 'http://example.com/?foo=bar&dir=/baz'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&foo=bar&dir=/baz'

def test_post_extract_malformed_form_data(self):
mq = MethodQueryCanonicalizer('POST', 'application/x-www-form-urlencoded',
len(self.binary_post_data), BytesIO(self.binary_post_data))

#base64 encoded data
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=gTZsYEygNFAO4HICtYkZAGZQ2w6wAiw='
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=gTZsYEygNFAO4HICtYkZAGZQ2w6wAiw='

def test_post_extract_no_boundary_in_multipart_form_mimetype(self):
mq = MethodQueryCanonicalizer('POST', 'multipart/form-data',
len(self.post_data), BytesIO(self.post_data))

assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'


def test_options(self):
mq = MethodQueryCanonicalizer('OPTIONS', '', 0, BytesIO())
assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=options'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=OPTIONS'

def test_head(self):
mq = MethodQueryCanonicalizer('HEAD', '', 0, BytesIO())
assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=head'
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_method=HEAD'

def test_amf_parse(self):
mq = MethodQueryCanonicalizer('POST', 'application/x-amf', 0, BytesIO())
Expand Down
6 changes: 3 additions & 3 deletions sample_archive/cdx/post-test.cdx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CDX N b a m s k r M S V g
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
org,httpbin)/post?foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 http://httpbin.org/post application/json 200 M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2 - - 723 1196 post-test.warc.gz
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 http://httpbin.org/post?foo=bar application/json 200 B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ - - 723 2395 post-test.warc.gz
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 http://httpbin.org/post application/json 200 M532K5WS4GY2H4OVZO6HRPOP47A7KDWU - - 720 0 post-test.warc.gz
6 changes: 3 additions & 3 deletions sample_archive/cdxj/post-test.cdxj
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
org,httpbin)/post?foo=bar&test=abc 20140610000859 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M532K5WS4GY2H4OVZO6HRPOP47A7KDWU", "length": "720", "offset": "0", "filename": "post-test.warc.gz"}
org,httpbin)/post?a=1&b=[]&c=3 20140610001151 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2", "length": "723", "offset": "1196", "filename": "post-test.warc.gz"}
org,httpbin)/post?data=^&foo=bar 20140610001255 {"url": "http://httpbin.org/post?foo=bar", "mime": "application/json", "status": "200", "digest": "B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ", "length": "723", "offset": "2395", "filename": "post-test.warc.gz"}
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M532K5WS4GY2H4OVZO6HRPOP47A7KDWU", "length": "720", "offset": "0", "filename": "post-test.warc.gz"}
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url": "http://httpbin.org/post", "mime": "application/json", "status": "200", "digest": "M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2", "length": "723", "offset": "1196", "filename": "post-test.warc.gz"}
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url": "http://httpbin.org/post?foo=bar", "mime": "application/json", "status": "200", "digest": "B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ", "length": "723", "offset": "2395", "filename": "post-test.warc.gz"}
10 changes: 8 additions & 2 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,9 +389,15 @@ def test_post_2(self, fmod):
assert resp.status_int == 200
assert '"data": "^"' in resp.text

def test_post_match_as_json(self, fmod):
    """The same payload posted as JSON also matches the recorded query."""
    replay_url = '/pywb/20140610001255{0}/http://httpbin.org/post?foo=bar'
    payload = {'data': '^'}

    resp = self.post_json(replay_url, fmod, payload)

    assert resp.status_int == 200
    assert '"data": "^"' in resp.text

def test_post_invalid(self, fmod):
# not json
resp = self.post_json('/pywb/20140610001255{0}/http://httpbin.org/post?foo=bar', fmod, {'data': '^'}, status=404)
# wrong param
resp = self.post('/pywb/20140610001255{0}/http://httpbin.org/post?foo=bar', fmod, {'data': '^^'}, status=404)
assert resp.status_int == 404

def test_post_referer_redirect(self, fmod):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_record_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def test_cdx_all_coll(self):
assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D'
assert cdxj_lines[3]['url'] == 'http://httpbin.org/get?C=D2'

assert cdxj_lines[0]['urlkey'] == 'org,httpbin)/get?__pywb_method=head&a=b'
assert cdxj_lines[0]['urlkey'] == 'org,httpbin)/get?__wb_method=head&a=b'
assert cdxj_lines[1]['urlkey'] == 'org,httpbin)/get?a=b'
assert cdxj_lines[2]['urlkey'] == 'org,httpbin)/get?c=d'
assert cdxj_lines[3]['urlkey'] == 'org,httpbin)/get?c=d2'
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,6 @@ deps =
-rrequirements.txt
-rextra_requirements.txt
commands =
py.test
py.test --cov-config .coveragerc --cov pywb -v --doctest-modules ./pywb/ tests/


0 comments on commit 626da99

Please sign in to comment.