Add UrlEncoder for decoding URL paths

The UrlEncoder provides two methods to decode a URL path. One decodes the raw path in place and the other writes the output to the given byte span. The decoder unescapes the path string.
davidpadbury · Oct 4, 2016 · 54e99ac · 54e99ac
1 parent 9b3e31c
commit 54e99ac
Show file tree

Hide file tree

Showing 8 changed files with 537 additions and 8 deletions.
diff --git a/corefxlab.sln b/corefxlab.sln
@@ -107,9 +107,13 @@ Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Collections.Sequence
 EndProject
 Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Collections.Sequences.Tests", "tests\System.Collections.Sequences.Tests\System.Collections.Sequences.Tests.xproj", "{8EC131AE-D04F-4B7D-B223-D82821D8AFCC}"
 EndProject
+<<<<<<< 9b3e31c0a9209564c78700d5120dfad2690b1b86
 Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Text.Json.Dynamic", "src\System.Text.Json.Dynamic\System.Text.Json.Dynamic.xproj", "{BB6D79C1-783F-4B87-A281-5EAB22CA7BF0}"
 EndProject
 Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Text.Json.Dynamic.Tests", "tests\System.Text.Json.Tests.Dynamic\System.Text.Json.Dynamic.Tests.xproj", "{B6FBB81F-241F-4603-9510-E269F843F6CB}"
+=======
+Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "System.Text.Encodings.Web.Utf8.Tests", "tests\System.Text.Encodings.Web.Utf8.Tests\System.Text.Encodings.Web.Utf8.Tests.xproj", "{D38CF672-795E-41D2-B10D-CFA87927BBDC}"
+>>>>>>> Add UrlEncoder.UnescapeInPlace
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -309,6 +313,10 @@ Global
 		{B6FBB81F-241F-4603-9510-E269F843F6CB}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{B6FBB81F-241F-4603-9510-E269F843F6CB}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{B6FBB81F-241F-4603-9510-E269F843F6CB}.Release|Any CPU.Build.0 = Release|Any CPU
+		{D38CF672-795E-41D2-B10D-CFA87927BBDC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{D38CF672-795E-41D2-B10D-CFA87927BBDC}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{D38CF672-795E-41D2-B10D-CFA87927BBDC}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{D38CF672-795E-41D2-B10D-CFA87927BBDC}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
@@ -362,5 +370,6 @@ Global
 		{8EC131AE-D04F-4B7D-B223-D82821D8AFCC} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
 		{BB6D79C1-783F-4B87-A281-5EAB22CA7BF0} = {4B000021-5278-4F2A-B734-DE49F55D4024}
 		{B6FBB81F-241F-4603-9510-E269F843F6CB} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
+		{D38CF672-795E-41D2-B10D-CFA87927BBDC} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
 	EndGlobalSection
 EndGlobal
diff --git a/src/System.Text.Encodings.Web.Utf8/UrlEncoder.cs b/src/System.Text.Encodings.Web.Utf8/UrlEncoder.cs
@@ -1,19 +1,330 @@
-using System;
-using System.Collections.Generic;
-using System.Threading.Tasks;
-
-namespace System.Text.Encodings.Web.Utf8
+namespace System.Text.Encodings.Web.Utf8
 {
     public class UrlEncoder
     {
-        public static void UnescapeInPlace(Span<byte> data)
+        /// <summary>
+        /// Unescape a URL path from <paramref name="source"/> into <paramref name="destination"/>.
+        /// </summary>
+        /// <param name="source">The byte span holding the UTF-8 encoded URL path.</param>
+        /// <param name="destination">The byte span the unescaped path is written to; an ArgumentException is thrown if it is shorter than <paramref name="source"/>.</param>
+        /// <returns>The number of bytes at the start of <paramref name="destination"/> that hold the unescaped URL path.</returns>
+        public static int Decode(ReadOnlySpan<byte> source, Span<byte> destination)
+        {
+            if (destination.Length < source.Length)
+            {
+                throw new ArgumentException(
+                    "Length of the destination byte span is less than the source.",
+                    nameof(destination));
+            }
+
+            // Decode only the copied prefix: bytes of destination beyond source.Length are not part of the input.
+            source.CopyTo(destination);
+            return DecodeInPlace(destination.Slice(0, source.Length));
+        }
+
+        /// <summary>
+        /// Unescape a URL path in place.
+        /// </summary>
+        /// <param name="buffer">The byte span holding the UTF-8 encoded URL path; it is overwritten with the result.</param>
+        /// <returns>The number of bytes at the start of <paramref name="buffer"/> that hold the result.</returns>
+        /// <remarks>
+        /// Decoding is done in place: the result occupies the first bytes of the input span and is
+        /// never longer than the input. Bytes past the returned length are not cleared.
+        /// </remarks>
+        public static int DecodeInPlace(Span<byte> buffer)
         {
+            // index of the next byte to read from the input
+            var sourceIndex = 0;
+
+            // index of the next slot to write an output byte to
+            var destinationIndex = 0;
+
+            while (true)
+            {
+                if (sourceIndex == buffer.Length)
+                {
+                    break;
+                }
+
+                if (buffer[sourceIndex] == '%')
+                {
+                    var decodeIndex = sourceIndex;
+
+                    // If decoding succeeds, DecodeCore has written the decoded bytes and
+                    // advanced destinationIndex to the next write-ready slot. If the scanned
+                    // percent-encodings cannot be interpreted as a sequence of UTF-8 octets,
+                    // those bytes must be copied to the output as is.
+                    // Either way decodeIndex ends up on the first byte that has not been
+                    // consumed yet, so after a failed decode the bytes between sourceIndex
+                    // and decodeIndex are copied to the output untouched.
+                    if (!DecodeCore(ref decodeIndex, ref destinationIndex, buffer))
+                    {
+                        Copy(sourceIndex, decodeIndex, ref destinationIndex, buffer);
+                    }
+
+                    sourceIndex = decodeIndex;
+                }
+                else
+                {
+                    buffer[destinationIndex++] = buffer[sourceIndex++];
+                }
+            }
+
+            return destinationIndex;
+        }
+
+        /// <summary>
+        /// Unescape one percent-encoded UTF-8 sequence (one to four escaped bytes) starting at <paramref name="sourceIndex"/>.
+        /// </summary>
+        /// <param name="sourceIndex">Index of the leading % char; advanced past the bytes that were consumed.</param>
+        /// <param name="destinationIndex">Index to write to; advanced only when the whole sequence is valid and written.</param>
+        /// <param name="buffer">The byte span that is both read from and written to.</param>
+        /// <returns>true if the decoded bytes were written; false if the caller should copy the scanned bytes as is.</returns>
+        private static bool DecodeCore(ref int sourceIndex, ref int destinationIndex, Span<byte> buffer)
+        {
+            // Decode the leading escaped byte. Even on failure sourceIndex has moved past the '%',
+            // so the caller copies at least that one byte to the output unchanged.
+            var byte1 = UnescapePercentEncoding(ref sourceIndex, buffer);
+            if (byte1 == -1)
+            {
+                return false;
+            }
+
+            if (byte1 == 0)
+            {
+                throw new InvalidOperationException("The path contains null characters.");
+            }
+
+            if (byte1 <= 0x7F)
+            {
+                // first byte <= 0x7F: it is a single-byte ASCII character
+                buffer[destinationIndex++] = (byte)byte1;
+                return true;
+            }
+
+            int byte2 = 0, byte3 = 0, byte4 = 0;
+
+            // a multi-byte sequence: work out how many continuation bytes must follow
+            var currentDecodeBits = 0;
+            var byteCount = 1;
+            var expectValueMin = 0;
+            if ((byte1 & 0xE0) == 0xC0)
+            {
+                // 110x xxxx, expect one more byte
+                currentDecodeBits = byte1 & 0x1F;
+                byteCount = 2;
+                expectValueMin = 0x80;
+            }
+            else if ((byte1 & 0xF0) == 0xE0)
+            {
+                // 1110 xxxx, expect two more bytes
+                currentDecodeBits = byte1 & 0x0F;
+                byteCount = 3;
+                expectValueMin = 0x800;
+            }
+            else if ((byte1 & 0xF8) == 0xF0)
+            {
+                // 1111 0xxx, expect three more bytes
+                currentDecodeBits = byte1 & 0x07;
+                byteCount = 4;
+                expectValueMin = 0x10000;
+            }
+            else
+            {
+                // invalid first byte
+                return false;
+            }
+
+            var remainingBytes = byteCount - 1;
+            while (remainingBytes > 0)
+            {
+                // read the next percent-encoded continuation byte (three chars: %XX)
+                if (sourceIndex == buffer.Length)
+                {
+                    return false;
+                }
+
+                var nextSourceIndex = sourceIndex;
+                var nextByte = UnescapePercentEncoding(ref nextSourceIndex, buffer);
+                if (nextByte == -1)
+                {
+                    return false;
+                }
+
+                if ((nextByte & 0xC0) != 0x80)
+                {
+                    // the follow up byte is not in form of 10xx xxxx
+                    return false;
+                }
+
+                currentDecodeBits = (currentDecodeBits << 6) | (nextByte & 0x3F);
+                remainingBytes--;
+
+                if (remainingBytes == 1 && currentDecodeBits >= 0x360 && currentDecodeBits <= 0x37F)
+                {
+                    // this is going to end up in the range of 0xD800-0xDFFF UTF-16 surrogates that
+                    // are not allowed in UTF-8;
+                    return false;
+                }
+
+                if (remainingBytes == 2 && currentDecodeBits >= 0x110)
+                {
+                    // this is going to be out of the upper Unicode bound 0x10FFFF.
+                    return false;
+                }
+
+                sourceIndex = nextSourceIndex;
+                if (byteCount - remainingBytes == 2)
+                {
+                    byte2 = nextByte;
+                }
+                else if (byteCount - remainingBytes == 3)
+                {
+                    byte3 = nextByte;
+                }
+                else if (byteCount - remainingBytes == 4)
+                {
+                    byte4 = nextByte;
+                }
+            }
+
+            if (currentDecodeBits < expectValueMin)
+            {
+                // overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
+                return false;
+            }
+
+            // all bytes are verified, write to the output
+            // TODO: measure later to determine if the performance of following logic can be improved
+            //       the idea is to combine the bytes into short/int and write to span directly to avoid
+            //       range check cost
+            if (byteCount > 0)
+            {
+                buffer[destinationIndex++] = (byte)byte1;
+            }
+            if (byteCount > 1)
+            {
+                buffer[destinationIndex++] = (byte)byte2;
+            }
+            if (byteCount > 2)
+            {
+                buffer[destinationIndex++] = (byte)byte3;
+            }
+            if (byteCount > 3)
+            {
+                buffer[destinationIndex++] = (byte)byte4;
+            }
+
+            return true;
+        }
+
+        private static void Copy(int begin, int end, ref int writer, Span<byte> buffer)
+        {
+            while (begin != end) // forward, byte-by-byte move of buffer[begin..end) to buffer[writer..]; writer is advanced per byte
+            {
+                buffer[writer++] = buffer[begin++];
+            }
+        }
+
+        /// <summary>
+        /// Read one percent-encoding (%XX) and try to unescape it.
+        ///
+        /// The operation first reads the byte the <paramref name="scan"/>
+        /// index points at. If it is % the two following bytes are examined.
+        /// If both are hexadecimal digits they are unescaped, <paramref name="scan"/>
+        /// is moved past all three bytes and the decoded value is returned.
+        /// The one exception is %2F ('/'), which is reported as a failure.
+        ///
+        /// If the first byte is not % the <paramref name="scan"/> index
+        /// is moved one byte forward and -1 is returned.
+        ///
+        /// If the two following bytes can't be unescaped (or spell %2F) the
+        /// <paramref name="scan"/> index is left just after the % and -1
+        /// is returned.
+        /// </summary>
+        /// <param name="scan">Index of the byte to start reading at; it is not bounds-checked here.</param>
+        /// <param name="buffer">The byte span to read from.</param>
+        /// <returns>The unescaped byte value (0-255) on success; otherwise -1.</returns>
+        private static int UnescapePercentEncoding(ref int scan, Span<byte> buffer)
+        {
+            if (buffer[scan++] != '%')
+            {
+                return -1;
+            }
+
+            var probe = scan;
+
+            var value1 = ReadHex(ref probe, buffer);
+            if (value1 == -1)
+            {
+                return -1;
+            }
+
+            var value2 = ReadHex(ref probe, buffer);
+            if (value2 == -1)
+            {
+                return -1;
+            }
+
+            if (SkipUnescape(value1, value2))
+            {
+                return -1;
+            }
+
+            scan = probe;
+            return (value1 << 4) + value2;
+        }
+
+
+        /// <summary>
+        /// Read the next byte and convert it from an ASCII hexadecimal digit into its value.
+        ///
+        /// The <paramref name="scan"/> index is moved to the next byte whether or not the byte
+        /// is a hex digit; it is left unchanged only when it is already at the end of the buffer.
+        /// </summary>
+        /// <param name="scan">The index of the byte in the buffer to read.</param>
+        /// <param name="buffer">The byte span from which the hex digit is read.</param>
+        /// <returns>The value of the hex digit (0-15) on success; otherwise -1.</returns>
+        private static int ReadHex(ref int scan, Span<byte> buffer)
+        {
+            if (scan == buffer.Length)
+            {
+                return -1;
+            }
+
+            var value = buffer[scan++];
+            var isHead = ((value >= '0') && (value <= '9')) ||
+                         ((value >= 'A') && (value <= 'F')) ||
+                         ((value >= 'a') && (value <= 'f')); // true when value is an ASCII hex digit
+
+            if (!isHead)
+            {
+                return -1;
+            }
 
+            if (value <= '9')
+            {
+                return value - '0';
+            }
+            else if (value <= 'F')
+            {
+                return (value - 'A') + 10;
+            }
+            else // a - f
+            {
+                return (value - 'a') + 10;
+            }
         }
 
-        public static void Unescape(Span<byte> source, Span<byte> destination)
+        private static bool SkipUnescape(int value1, int value2)
         {
+            // value1/value2 are the high/low hex digit values; %2F (an escaped '/') is the only sequence left escaped
+            if (value1 == 2 && value2 == 15)
+            {
+                return true;
+            }
 
+            return false;
         }
     }
-}
+}
diff --git a/tests/System.Text.Encodings.Web.Utf8.Tests/Properties/AssemblyInfo.cs b/tests/System.Text.Encodings.Web.Utf8.Tests/Properties/AssemblyInfo.cs
@@ -0,0 +1,19 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("System.Text.Encodings.Web.Utf8.Tests")]
+[assembly: AssemblyTrademark("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components.  If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("d38cf672-795e-41d2-b10d-cfa87927bbdc")]