Friday, 8 November 2013

Umbraco truncate vs Regex


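A comparison of two ways of producing a short summary from an HTML string: removing the HTML tags with a regex, and Umbraco's Truncate, which processes the markup through memory streams to achieve the same result.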

Compiled regex is faster.



  /// <summary>
  /// Console benchmark comparing the regex-based summary (Do.DisplaySearchSummary)
  /// with the stream-based Do.Truncate on the same HTML sample.
  /// </summary>
  class Program
    {
        /// <summary>
        /// Runs both implementations against a fixed HTML sample and prints the
        /// elapsed Stopwatch ticks for each.
        /// </summary>
        /// <param name="args">
        /// Optional. When the first argument parses as a positive integer it is used
        /// as the number of timed iterations; otherwise a single iteration is timed.
        /// </param>
        static void Main(string[] args)
        {
            // Number of timed iterations per implementation (defaults to 1).
            int repeats = 1;
            int requested;
            if (args != null && args.Length > 0 && int.TryParse(args[0], out requested) && requested > 0)
            {
                repeats = requested;
            }

            // Lazily matches anything between '<' and the next '>' (including newlines).
            Regex regex = new Regex(@"<(.|\n)*?>",RegexOptions.Compiled);

            var htmlStr = @"
<div id='lipsum'>
<p>
</p><ul>
<li>Lorem ipsum dolor sit amet, consectetur adipiscing elit.</li>
<li>Vestibulum tincidunt eros ac velit scelerisque pharetra.</li>
</ul>
<p></p>
<p>
</p><ul>
<li>Aenean pellentesque mauris et massa eleifend tristique.</li>
<li>In rutrum magna at arcu molestie porta.</li>
<li>Pellentesque rutrum nibh non est auctor varius.</li>
</ul>
<p></p>
<p>
</p><ul>
<li>Vestibulum elementum ante blandit risus cursus convallis.</li>
<li>In id magna lacinia, luctus sapien in, tempor elit.</li>
</ul>
<p></p>
<p>
</p><ul>
<li>Integer tempus tellus nec purus ultrices, quis facilisis magna adipiscing.</li>
<li>Cras convallis sapien vel augue ultrices pulvinar.</li>
<li>Sed pretium eros vel tellus feugiat, ut congue erat pellentesque.</li>
<li>Vestibulum convallis tortor congue sapien condimentum, sit amet vestibulum nisl tristique.</li>
<li>Cras non lacus sagittis, auctor massa eget, pulvinar elit.</li>
<li>Morbi eu augue tincidunt, luctus lacus vestibulum, varius elit.</li>
</ul>
<p></p>
<p>
</p><ul>
<li>Cras at dui sed justo convallis mattis.</li>
<li>Curabitur molestie mi nec dui interdum aliquet.</li>
<li>Maecenas malesuada magna non aliquam sollicitudin.</li>
<li>Maecenas porta erat quis turpis dictum faucibus.</li>
<li>Duis dignissim elit a ultrices tristique.</li>
</ul>
<p></p></div>";

            // Warm-up: run each path once untimed so one-off first-call (JIT) cost
            // is not charged to whichever implementation happens to be measured.
            Do.DisplaySearchSummary(htmlStr, 150, regex);
            Do.Truncate(htmlStr, 150, true, false);

            Console.WriteLine("Inhouse");
            var timer1 = Stopwatch.StartNew();

            for (int i = 0; i < repeats; i++)
            {
                Do.DisplaySearchSummary(htmlStr, 150, regex);
            }
            timer1.Stop();

            Console.WriteLine("Inhouse: " + timer1.ElapsedTicks);

            Console.WriteLine("umbraco");
            var timer2 = Stopwatch.StartNew();

            for (int i = 0; i < repeats; i++)
            {
                Do.Truncate(htmlStr, 150, true, false);
            }
            timer2.Stop();

            Console.WriteLine("Umbraco: " + timer2.ElapsedTicks);

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
    }

    /// <summary>
    /// Helpers for turning an HTML string into a shortened summary: a regex-based
    /// tag stripper with word-boundary truncation, and a character-by-character
    /// truncation routine that re-emits tags.
    /// </summary>
    public class Do
    {
        /// <summary>
        /// Strips the HTML from string by replacing every match of
        /// <paramref name="regex"/> with an empty string.
        /// </summary>
        /// <param name="htmlString">The HTML string.</param>
        /// <param name="regex">The pattern whose matches (the tags) are removed.</param>
        /// <returns>
        /// String without html tags; a null or empty input is returned unchanged.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="regex"/> is null and <paramref name="htmlString"/> is not null or empty.
        /// </exception>
        public static string StripHtmlFromString(string htmlString, Regex regex)
        {
            if (string.IsNullOrEmpty(htmlString)) return htmlString;

            // Checked after the early return so a null/empty input still needs no regex.
            if (regex == null) throw new ArgumentNullException("regex");

            return regex.Replace(htmlString, string.Empty);
        }


        /// <summary>
        /// Strips tags from <paramref name="descriptionText"/> and, when the remaining
        /// text is at least <paramref name="requiredLength"/> characters long, shortens
        /// it and appends <paramref name="ending"/>.
        /// </summary>
        /// <param name="descriptionText">The (possibly HTML) text to summarise.</param>
        /// <param name="requiredLength">
        /// Length threshold. The text is first cut to <c>requiredLength - 1</c> characters
        /// (zero characters when that would be negative) and then cut back to the last
        /// space inside that prefix, if there is one.
        /// </param>
        /// <param name="regex">The pattern passed to <see cref="StripHtmlFromString"/>.</param>
        /// <param name="ending">Suffix appended to a shortened text.</param>
        /// <returns>
        /// The stripped text unchanged when it is null, empty or shorter than
        /// <paramref name="requiredLength"/>; otherwise the shortened text followed by
        /// <paramref name="ending"/>.
        /// </returns>
        public static string DisplaySearchSummary(string descriptionText, int requiredLength, Regex regex, string ending = "...")
        {
            descriptionText = StripHtmlFromString(descriptionText, regex);

            if (string.IsNullOrEmpty(descriptionText) || descriptionText.Length < requiredLength)
            {
                return descriptionText;
            }

            // Clamp so a requiredLength of 1 or less yields an empty prefix instead of
            // a negative Substring length.
            int cutLength = requiredLength > 1 ? requiredLength - 1 : 0;
            string summary = descriptionText.Substring(0, cutLength);

            // Break at the last space so a word is not cut in half. LastIndexOf returns -1
            // when the prefix contains no space; in that case keep the whole prefix rather
            // than passing -1 on to Substring.
            int lastSpace = summary.LastIndexOf(' ');
            if (lastSpace >= 0)
            {
                summary = summary.Substring(0, lastSpace);
            }

            return string.Concat(summary, ending);
        }


        /// <summary>
        /// Walks <paramref name="html"/> one character at a time, copying text to the
        /// output until the running text counter reaches <paramref name="length"/>,
        /// and re-emitting the tags it parses along the way.
        /// </summary>
        /// <param name="html">The HTML to truncate.</param>
        /// <param name="length">
        /// Limit for the text counter. The counter advances for characters outside a tag
        /// (for every character when <paramref name="treatTagsAsContent"/> is true) and
        /// also once for every '&gt;' read.
        /// </param>
        /// <param name="addElipsis">When true, "&amp;hellip;" is written once, at the point the limit is reached.</param>
        /// <param name="treatTagsAsContent">When true, characters inside tags also advance the text counter.</param>
        /// <returns>
        /// The produced markup after one pass replacing double spaces with a single
        /// space, trimmed of leading and trailing whitespace.
        /// </returns>
        public static string Truncate(string html, int length, bool addElipsis, bool treatTagsAsContent)
        {
            // outputms/outputtw collect the result; ms/tw/tr round-trip the input
            // through a memory stream so it can be read back character by character.
            using (var outputms = new MemoryStream())
            {
                using (var outputtw = new StreamWriter(outputms))
                {
                    using (var ms = new MemoryStream())
                    {
                        using (var tw = new StreamWriter(ms))
                        {
                            tw.Write(html);
                            tw.Flush();
                            ms.Position = 0;
                            // Names of the opening tags written so far that have not been closed.
                            var tagStack = new Stack<string>();
                            using (TextReader tr = new StreamReader(ms))
                            {
                                bool IsInsideElement = false;   // between '<' and '>' (only set before the limit is reached)
                                bool lengthReached = false;     // the text counter has hit the limit
                                int ic = 0;                     // current character code, -1 at end of input
                                int currentLength = 0, currentTextLength = 0;
                                string currentTag = string.Empty;   // tag name: characters up to the first space in the tag
                                string tagContents = string.Empty;  // remainder of the tag from the first space onwards
                                bool insideTagSpaceEncountered = false;
                                bool isTagClose = false;        // the tag being read started with "</"
                                while ((ic = tr.Read()) != -1)
                                {
                                    bool write = true;

                                    if (ic == (int)'<')
                                    {
                                        // Start of a tag: reset the per-tag state. After the limit
                                        // has been reached the tag is no longer tracked as an element.
                                        if (!lengthReached)
                                        {
                                            IsInsideElement = true;
                                        }
                                        insideTagSpaceEncountered = false;
                                        currentTag = string.Empty;
                                        tagContents = string.Empty;
                                        isTagClose = false;
                                        if (tr.Peek() == (int)'/')
                                        {
                                            isTagClose = true;
                                        }
                                    }
                                    else if (ic == (int)'>')
                                    {
                                        //if (IsInsideElement)
                                        //{
                                        IsInsideElement = false;
                                        //if (write)
                                        //{
                                        //  outputtw.Write('>');
                                        //}
                                        // Note: every '>' advances the text counter.
                                        currentTextLength++;
                                        // A closing tag is written using the name on top of the
                                        // stack, not the name that was read from the input.
                                        if (isTagClose && tagStack.Count > 0)
                                        {
                                            string thisTag = tagStack.Pop();
                                            outputtw.Write("</" + thisTag + ">");
                                        }
                                        if (!isTagClose && currentTag.Length > 0)
                                        {
                                            // Opening tags are only emitted before the limit is reached.
                                            if (!lengthReached)
                                            {
                                                tagStack.Push(currentTag);
                                                outputtw.Write("<" + currentTag);
                                                if (tr.Peek() != (int)' ')
                                                {
                                                    if (!string.IsNullOrEmpty(tagContents))
                                                    {
                                                        if (tagContents.EndsWith("/"))
                                                        {
                                                            //short close
                                                            tagStack.Pop();
                                                        }
                                                        outputtw.Write(tagContents);
                                                    }
                                                    outputtw.Write(">");
                                                }
                                            }
                                        }
                                        //}
                                        // '>' itself is never passed to the generic write/count logic below.
                                        continue;
                                    }
                                    else
                                    {
                                        if (IsInsideElement)
                                        {
                                            if (ic == (int)' ')
                                            {
                                                if (!insideTagSpaceEncountered)
                                                {
                                                    insideTagSpaceEncountered = true;
                                                    //if (!isTagClose)
                                                    //{
                                                    // tagStack.Push(currentTag);
                                                    //}
                                                }
                                            }
                                            // Characters before the first space build up the tag name.
                                            if (!insideTagSpaceEncountered)
                                            {
                                                currentTag += (char)ic;
                                            }
                                        }
                                    }
                                    // Characters belonging to a tag are buffered, not written directly.
                                    if (IsInsideElement || insideTagSpaceEncountered)
                                    {
                                        write = false;
                                        if (insideTagSpaceEncountered)
                                        {
                                            tagContents += (char)ic;
                                        }
                                    }
                                    if (!IsInsideElement || treatTagsAsContent)
                                    {
                                        currentTextLength++;
                                    }
                                    currentLength++;
                                    if (currentTextLength <= length || (lengthReached && IsInsideElement))
                                    {
                                        if (write)
                                        {
                                            outputtw.Write((char)ic);
                                        }
                                    }
                                    if (!lengthReached && currentTextLength >= length)
                                    {
                                        //reached truncate point
                                        if (addElipsis)
                                        {
                                            outputtw.Write("&hellip;");
                                        }
                                        lengthReached = true;
                                    }

                                }

                            }
                        }
                    }
                    // Rewind the output stream and read the produced markup back as a string.
                    outputtw.Flush();
                    outputms.Position = 0;
                    using (TextReader outputtr = new StreamReader(outputms))
                    {
                        return outputtr.ReadToEnd().Replace("  ", " ").Trim();
                    }
                }
            }
        }
    }