Skip to content

Commit

Permalink
Add UrlEncoder for decode Url Path
Browse files Browse the repository at this point in the history
The UrlEncoder provides two methods to decode a URL path. One decodes the
raw path in place and the other saves the output to the given byte span. The
decoder will unescape the path string.
  • Loading branch information
troydai committed Oct 4, 2016
1 parent 9b3e31c commit 54e99ac
Show file tree
Hide file tree
Showing 8 changed files with 537 additions and 8 deletions.
9 changes: 9 additions & 0 deletions corefxlab.sln
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,13 @@ Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Collections.Sequence
EndProject
Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Collections.Sequences.Tests", "tests\System.Collections.Sequences.Tests\System.Collections.Sequences.Tests.xproj", "{8EC131AE-D04F-4B7D-B223-D82821D8AFCC}"
EndProject
Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Text.Json.Dynamic", "src\System.Text.Json.Dynamic\System.Text.Json.Dynamic.xproj", "{BB6D79C1-783F-4B87-A281-5EAB22CA7BF0}"
EndProject
Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Text.Json.Dynamic.Tests", "tests\System.Text.Json.Tests.Dynamic\System.Text.Json.Dynamic.Tests.xproj", "{B6FBB81F-241F-4603-9510-E269F843F6CB}"
EndProject
Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Text.Encodings.Web.Utf8.Tests", "tests\System.Text.Encodings.Web.Utf8.Tests\System.Text.Encodings.Web.Utf8.Tests.xproj", "{D38CF672-795E-41D2-B10D-CFA87927BBDC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down Expand Up @@ -309,6 +313,10 @@ Global
{B6FBB81F-241F-4603-9510-E269F843F6CB}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B6FBB81F-241F-4603-9510-E269F843F6CB}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B6FBB81F-241F-4603-9510-E269F843F6CB}.Release|Any CPU.Build.0 = Release|Any CPU
{D38CF672-795E-41D2-B10D-CFA87927BBDC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D38CF672-795E-41D2-B10D-CFA87927BBDC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D38CF672-795E-41D2-B10D-CFA87927BBDC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D38CF672-795E-41D2-B10D-CFA87927BBDC}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -362,5 +370,6 @@ Global
{8EC131AE-D04F-4B7D-B223-D82821D8AFCC} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
{BB6D79C1-783F-4B87-A281-5EAB22CA7BF0} = {4B000021-5278-4F2A-B734-DE49F55D4024}
{B6FBB81F-241F-4603-9510-E269F843F6CB} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
{D38CF672-795E-41D2-B10D-CFA87927BBDC} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
EndGlobalSection
EndGlobal
327 changes: 319 additions & 8 deletions src/System.Text.Encodings.Web.Utf8/UrlEncoder.cs
Original file line number Diff line number Diff line change
@@ -1,19 +1,330 @@
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

namespace System.Text.Encodings.Web.Utf8
namespace System.Text.Encodings.Web.Utf8
{
public class UrlEncoder
{
public static void UnescapeInPlace(Span<byte> data)
/// <summary>
/// Unescapes a URL path, writing the result into a separate buffer.
/// </summary>
/// <param name="source">The byte span holding the UTF-8 encoded URL path.</param>
/// <param name="destination">
/// The byte span that receives the unescaped URL path. It must be at least as long as
/// <paramref name="source"/>.
/// </param>
/// <returns>
/// The number of bytes, counted from the start of <paramref name="destination"/>, that
/// make up the unescaped URL path.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="destination"/> is shorter than <paramref name="source"/>.
/// </exception>
public static int Decode(ReadOnlySpan<byte> source, Span<byte> destination)
{
    if (destination.Length < source.Length)
    {
        throw new ArgumentException(
            "Length of the destination byte span is less than the source.",
            nameof(destination));
    }

    // The guard above guarantees the copy fits.
    source.CopyTo(destination);

    // Decode only the bytes that were just copied. The destination may be longer than
    // the source, and whatever it held beyond source.Length is not part of the path:
    // it must neither be unescaped nor counted in the returned length.
    return DecodeInPlace(destination.Slice(0, source.Length));
}

/// <summary>
/// Unescapes a URL path, overwriting the input buffer with the result.
/// </summary>
/// <param name="buffer">The byte span holding the UTF-8 encoded URL path.</param>
/// <returns>
/// The number of bytes, counted from the start of <paramref name="buffer"/>, that make up
/// the result.
/// </returns>
/// <remarks>
/// The buffer is read and written front to back with two indices; the write index never
/// gets ahead of the read index, so the result ends up in a leading slice of the input.
/// </remarks>
public static int DecodeInPlace(Span<byte> buffer)
{
    // next byte to examine
    var readPosition = 0;

    // next slot to receive an output byte
    var writePosition = 0;

    while (readPosition != buffer.Length)
    {
        if (buffer[readPosition] != '%')
        {
            // ordinary byte: move it down to the write slot
            buffer[writePosition++] = buffer[readPosition++];
            continue;
        }

        // DecodeCore advances this index past everything it scanned, whether or not
        // the scanned bytes turned out to be a decodable percent-encoding.
        var scannedUpTo = readPosition;
        var decoded = DecodeCore(ref scannedUpTo, ref writePosition, buffer);

        if (!decoded)
        {
            // The scanned bytes are not a decodable sequence of UTF-8 octets:
            // keep them in the output exactly as they appeared in the input.
            Copy(readPosition, scannedUpTo, ref writePosition, buffer);
        }

        readPosition = scannedUpTo;
    }

    return writePosition;
}

/// <summary>
/// Tries to unescape one percent-encoded UTF-8 character (one to four escaped bytes).
/// </summary>
/// <param name="sourceIndex">
/// On entry, the index of the leading % byte. On return, the index of the first byte that
/// was not consumed; on failure the bytes between the entry value and this index are the
/// ones the caller should copy through unchanged.
/// </param>
/// <param name="destinationIndex">
/// The index to write decoded bytes to. It is advanced past the written bytes on success
/// and left untouched on failure.
/// </param>
/// <param name="buffer">The byte span that is both read from and written to.</param>
/// <returns>
/// <c>true</c> if a character was decoded and written; <c>false</c> if the escaped bytes
/// could not be decoded as a single UTF-8 character, in which case nothing is written.
/// </returns>
/// <exception cref="InvalidOperationException">The first escaped byte decodes to 0.</exception>
private static bool DecodeCore(ref int sourceIndex, ref int destinationIndex, Span<byte> buffer)
{
    var leadByte = UnescapePercentEncoding(ref sourceIndex, buffer);
    if (leadByte == -1)
    {
        return false;
    }

    if (leadByte == 0)
    {
        throw new InvalidOperationException("The path contains null characters.");
    }

    if (leadByte <= 0x7F)
    {
        // single-byte (ASCII) character: nothing more to read
        buffer[destinationIndex++] = (byte)leadByte;
        return true;
    }

    // Multi-byte character: the lead byte tells how many bytes the sequence has,
    // carries the top bits of the code point, and fixes the smallest code point
    // that legitimately needs a sequence of that length.
    int codePoint;
    int sequenceLength;
    int smallestCodePoint;

    if ((leadByte & 0xE0) == 0xC0)
    {
        // 110x xxxx: two-byte sequence
        codePoint = leadByte & 0x1F;
        sequenceLength = 2;
        smallestCodePoint = 0x80;
    }
    else if ((leadByte & 0xF0) == 0xE0)
    {
        // 1110 xxxx: three-byte sequence
        codePoint = leadByte & 0x0F;
        sequenceLength = 3;
        smallestCodePoint = 0x800;
    }
    else if ((leadByte & 0xF8) == 0xF0)
    {
        // 1111 0xxx: four-byte sequence
        codePoint = leadByte & 0x07;
        sequenceLength = 4;
        smallestCodePoint = 0x10000;
    }
    else
    {
        // not a valid lead byte
        return false;
    }

    int secondByte = 0, thirdByte = 0, fourthByte = 0;

    var pending = sequenceLength - 1;
    while (pending > 0)
    {
        if (sourceIndex == buffer.Length)
        {
            // input ended in the middle of the sequence
            return false;
        }

        // Probe with a scratch index so that sourceIndex only moves once the
        // continuation byte has passed every check below.
        var lookahead = sourceIndex;
        var continuation = UnescapePercentEncoding(ref lookahead, buffer);

        if (continuation == -1 || (continuation & 0xC0) != 0x80)
        {
            // not an escaped byte, or not of the form 10xx xxxx
            return false;
        }

        codePoint = (codePoint << 6) | (continuation & 0x3F);
        pending--;

        if (pending == 1 && codePoint >= 0x360 && codePoint <= 0x37F)
        {
            // would land in 0xD800-0xDFFF, the UTF-16 surrogates, which are not
            // allowed in UTF-8
            return false;
        }

        if (pending == 2 && codePoint >= 0x110)
        {
            // would exceed the upper Unicode bound 0x10FFFF
            return false;
        }

        sourceIndex = lookahead;

        // remember the raw byte in the slot matching its position in the sequence
        switch (sequenceLength - pending)
        {
            case 2:
                secondByte = continuation;
                break;
            case 3:
                thirdByte = continuation;
                break;
            case 4:
                fourthByte = continuation;
                break;
        }
    }

    if (codePoint < smallestCodePoint)
    {
        // overlong encoding (e.g. two bytes used for something that needed only one)
        return false;
    }

    // Every byte is verified; only now is anything written to the output.
    // TODO: measure later whether combining the bytes into a short/int and writing
    // that to the span directly (to avoid range-check cost) is faster.
    buffer[destinationIndex++] = (byte)leadByte;
    buffer[destinationIndex++] = (byte)secondByte;

    if (sequenceLength > 2)
    {
        buffer[destinationIndex++] = (byte)thirdByte;
    }

    if (sequenceLength > 3)
    {
        buffer[destinationIndex++] = (byte)fourthByte;
    }

    return true;
}

/// <summary>
/// Copies the bytes in the index range [<paramref name="begin"/>, <paramref name="end"/>)
/// of <paramref name="buffer"/> to the position given by <paramref name="writer"/>,
/// one byte at a time from front to back.
/// </summary>
/// <param name="begin">The index of the first byte to copy.</param>
/// <param name="end">The index just past the last byte to copy.</param>
/// <param name="writer">The index to write to; advanced by one for every byte copied.</param>
/// <param name="buffer">The byte span that is both read from and written to.</param>
private static void Copy(int begin, int end, ref int writer, Span<byte> buffer)
{
    for (var reader = begin; reader != end; reader++)
    {
        buffer[writer++] = buffer[reader];
    }
}

/// <summary>
/// Reads one percent-encoding (% followed by two hexadecimal digits) and tries to
/// unescape it.
///
/// The byte at <paramref name="scan"/> is examined first and <paramref name="scan"/> is
/// moved just past it. If that byte is not %, -1 is returned.
///
/// Otherwise the next two bytes are read as hexadecimal digits. If both are valid and the
/// pair is not one that must stay escaped, <paramref name="scan"/> is moved past both
/// digits and the byte value they encode is returned.
///
/// If either digit is missing or invalid, or the pair must stay escaped,
/// <paramref name="scan"/> is left just past the % and -1 is returned.
/// </summary>
/// <param name="scan">The index of the byte to start reading at.</param>
/// <param name="buffer">The byte span to read from.</param>
/// <returns>The unescaped byte value on success; otherwise -1.</returns>
private static int UnescapePercentEncoding(ref int scan, Span<byte> buffer)
{
    var marker = buffer[scan++];
    if (marker != '%')
    {
        return -1;
    }

    // Read the digits through a scratch index so that scan stays right after the %
    // unless the whole escape is accepted.
    var cursor = scan;

    var highNibble = ReadHex(ref cursor, buffer);
    if (highNibble == -1)
    {
        return -1;
    }

    var lowNibble = ReadHex(ref cursor, buffer);
    if (lowNibble == -1)
    {
        return -1;
    }

    if (SkipUnescape(highNibble, lowNibble))
    {
        return -1;
    }

    scan = cursor;
    return (highNibble << 4) | lowNibble;
}


/// <summary>
/// Reads the byte at <paramref name="scan"/> and converts it from an ASCII hexadecimal
/// digit to its numeric value.
///
/// Whenever a byte is read, <paramref name="scan"/> is moved to the next byte, whether
/// or not that byte was a hexadecimal digit. If <paramref name="scan"/> is already at
/// the end of the buffer, nothing is read and it is left unchanged.
/// </summary>
/// <param name="scan">The index of the byte in the buffer to read.</param>
/// <param name="buffer">The byte span to read the hexadecimal digit from.</param>
/// <returns>The value of the digit (0 to 15) on success; otherwise -1.</returns>
private static int ReadHex(ref int scan, Span<byte> buffer)
{
    if (scan == buffer.Length)
    {
        return -1;
    }

    var digit = buffer[scan++];

    if (digit >= '0' && digit <= '9')
    {
        return digit - '0';
    }

    if (digit >= 'A' && digit <= 'F')
    {
        return (digit - 'A') + 10;
    }

    if (digit >= 'a' && digit <= 'f')
    {
        return (digit - 'a') + 10;
    }

    return -1;
}

public static void Unescape(Span<byte> source, Span<byte> destination)
/// <summary>
/// Tells whether a percent-encoding must be left escaped. The only such encoding is
/// %2F, the escaped form of '/'.
/// </summary>
/// <param name="value1">The value of the first hexadecimal digit.</param>
/// <param name="value2">The value of the second hexadecimal digit.</param>
/// <returns><c>true</c> if the digits are 2 and F; otherwise <c>false</c>.</returns>
private static bool SkipUnescape(int value1, int value2) => value1 == 0x2 && value2 == 0xF;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
// Configuration, company, and trademark are left as empty strings here; only
// the product name is filled in.
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("System.Text.Encodings.Web.Utf8.Tests")]
[assembly: AssemblyTrademark("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("d38cf672-795e-41d2-b10d-cfa87927bbdc")]
Loading

0 comments on commit 54e99ac

Please sign in to comment.