tsunami

log in
history

TIME parsing code

Luke Breuer
2008-01-13 05:29 UTC

As of January 12, 2008.

Consider: when this page renders, I'm sending TIME code through the TIME rendering (markup -> HTML) engine. This is, umm, meta. In particular, TIME syntax allows for backslash escaping. When this escaping is applied to the TIME code itself, weird things happen: backslashes that belong to the code are swallowed as if they were markup escapes. The backslash escaping code is highlighted below; even that code isn't shown correctly, because the escaping has been applied to it as well.
#region Parsing
/// <summary>
/// Renders the markup held in this.Data to an HTML fragment by HTML-encoding it and then
/// applying a fixed sequence of regex rewrites (block quotes, lists, headings, links,
/// emphasis, inline code, code blocks, escape removal).
/// </summary>
/// <remarks>
/// Every step operates on the output of the previous one, so the order of the statements
/// below is significant (for example, line breaks are turned into br tags before the code
/// block step, which then turns them back into "\n" inside the block).
/// NOTE(review): several pattern literals below contain "(?<!\)". As written, "\)" escapes
/// the parenthesis, so the lookbehind is not closed where it appears to be. This looks like
/// "(?<!\\)" ("not preceded by a backslash") that lost a backslash when the listing was
/// published - confirm against the real source before relying on these patterns.
/// </remarks>
/// <returns>The generated HTML, or null when this.Data is null.</returns>
public string ParseMarkup()
{
    // Nothing to render.
    if (this.Data == null)
        return null;
    
    // Encode first: from here on a double quote in the input appears as "&quot;",
    // which is what the quote-related patterns below look for.
    string html = HttpUtility.HtmlEncode(this.Data);

    // Rewrite footnote-style links ("text"[1] ... [1] target) into [[target:text]] so the
    // [[...]] step further down can render them.
    // BUG: breaks if a [=]+ line has a URL reference in it
    html = ParseSeparatedHyperlinks(html);

    // whatever: -> bold
    //html = Regex.Replace(html, @"^\w[a-zA-Z0-9_- ]+:(?!//)", "<strong>$0</strong>", RegexOptions.Multiline);

    // A quoted string that has only optional spaces between it and the surrounding line
    // breaks becomes a <blockquote>; a "!" directly after the opening quote (group 1) adds
    // class='important'. The match consumes the line breaks around the quote.
    // NOTE(review): RegexOptions.Multiline is not passed here, so ^ and $ only match at the
    // start/end of the whole text - confirm that is intended.
    html = Regex.Replace(html, 
        @"(?:\r?\n|^)\ *&quot;(!)?((?:(?!&quot;$).)+)&quot;\ *(?:\r?\n|$)", 
        m => string.Format("<blockquote{0}>{1}</blockquote>", m.Groups[1].Success ? " class='important'" : "", m.Groups[2].Value),
        RegexOptions.IgnoreCase);

    // two linebreaks before/after bullet -> one linebreak
    // NOTE(review): no RegexOptions.Multiline here either, so the ^ inside the second
    // lookbehind anchors to the start of the text only - verify.
    html = Regex.Replace(html,
        @"(?<=\r\n)\r\n(?=- )|" +
        @"(?<=^(?:  )*- [^\r^\n]*)\r\n(?=\r\n)", ""); // (?<b>\r\n) taken out (right before the last \r\n)

    // Drop the line break in front of "{{" unless the preceding line is a bullet/heading line
    // (optional pairs of spaces, then "-" or "="), and the line break after "}}" unless the
    // following line starts with such a marker.
    html = Regex.Replace(html,
        @"(?<!^(?:  )*[-=][^\n\r]*)\r\n(?={{)|(?<=}})\r\n(?!(?:  )*[-=])", "", RegexOptions.Multiline);

    // "- " bullet lines -> nested <ul>/<li> markup.
    html = ParseListMarkup(html);

    // Headings: a line (or text directly after a closing </ul>) that starts with 1..MaxHeading
    // "=" characters. The heading level is MaxHeading - count + 1, so "=" gives <h6> and
    // "======" gives <h1>. One trailing \r\n is consumed with the heading.
    const int MaxHeading = 6;
    html = Regex.Replace(
        html, "(?:^|(?<=</ul>))(={1," + MaxHeading + @"})([^\r\n]+)(?=[\r\n]|$)(?:\r\n)?", 
        m => string.Format("<h{0}>{1}</h{0}>", MaxHeading - m.Groups[1].Length + 1, m.Groups[2].Value),
        RegexOptions.Multiline);

    // Double-bracket links. Groups, as used by the callback below:
    //   1 = absolute URL (scheme://...), 2 = item name, 3 = "/<number>" suffix after the name,
    //   4 = optional display text after a ":".
    // NOTE(review): the opening "[[" and the "(?<!\)" in this literal look like they lost their
    // backslashes (presumably "\[\[" and "(?<!\\)") - confirm against the real source.
    // optional -> item url
    html = Regex.Replace(html, @"(?x)
        (?<!\)
        [[
        (?:
            (\w+://(?:[^/]+:\d+)?[^\]]+?) # http://a.b.com:800 is ambiguous: is the description 800, or is that the port?
            |
            ([a-zA-Z0-9][a-zA-Z0-9 -_#.:[\],]+)(/\d+)?
        )
        (?::([^:]*?))?
        \]\]", (Match m) =>
    {
        Group http = m.Groups[1];
        Group name = m.Groups[2];
        Group pk = m.Groups[3];
        Group display = m.Groups[4];

        // No URL captured: build an item link via Urls.Item(id, encodedName), passing the number
        // after the "/" as the id, or -1 when there is none. The link text is the display text
        // if given, else the name, else the raw pk text.
        // URL captured: emit a plain anchor whose text is the display text or the URL itself.
        return !http.Success
            ?
            Urls.Item(
                pk.Success
                ? int.Parse(pk.Value.Substring("/".Length))
                : -1,
            Time.Item.EncodeName(name.Value)).RenderHyperlink(
                display.Success
                    ? display.Value
                    : name.Success
                        ? name.Value
                        : pk.Value)
            :
            string.Format("<a href='{0}'>{1}</a>",
                http.Value,
                display.Success
                    ? display.Value
                    : http.Value);
    });

    // Turn bare scheme://... URLs into anchors. The leading lookbehind skips URLs that already
    // sit inside an href attribute; the trailing lookbehind keeps closing punctuation
    // (. , : ; !) out of the link.
    // NOTE(review): "(?<p>()" and "(?<-p>)" look like a balanced-parenthesis construct whose
    // "\(" and "\)" were lost in publishing - confirm.
    // hyperlink-ize
    html = Regex.Replace(html,
        @"(?<!<[^>]+href=[""'])\b\w+:// (?: (?<p>() | [\w#~!@#$%&-+=:;,./?] | (?(p)(?<-p>))) )+ (?<![.,:;!])", 
        "<a href='$0'>$0</a>", RegexOptions.IgnorePatternWhitespace);
    // susceptible to <a href="http://www.breuer.com/">;; test </a>,
    // which has been turned into ..."breuer.com/&quot;&gt;;"...

    // _text_ -> <em>text</em> and *text* -> <strong>text</strong>.
    // NOTE(review): both literals appear damaged as shown (unescaped "*" quantifiers,
    // "(?<!\)"); presumably "\*", "\\_" and "(?<!\\)" in the real source - confirm.
    // emphasize
    html = Regex.Replace(html, @"(?<!\)(?<=\s|^|>)_((?:[^_<>]|\_)+?)(?<!\)_", "<em>$1</em>");
    // embolden
    html = Regex.Replace(html, @"(?<!\)(?<=\s|^|>)*((?:[^_<>]|\*)+?)(?<!\)*", "<strong>$1</strong>");

    // remove line break after }}  //html = html.Replace("}}\r\n", "}}");
    // linebreaks -> html linebreaks
    // Only "\n" is replaced; a preceding "\r" is left in the text.
    html = html.Replace("\n", "<br />");

    // single-line-code -> <code>single-line-code</code>
    // (?<!\)(\\)* returns an uneven number of backslashes
    html = Regex.Replace(html, @"
        (?<!
            (?<!\)\(?:\\)*
        )
        `
        (
            (?>
                (?<!\)\(?:\\)*`
                |
                [^`]
            )*
        )
        `", "<code>$1</code>", RegexOptions.IgnorePatternWhitespace);

    // The callback below post-processes the captured block body before wrapping it in <pre>.
    // {{code}} -> <pre>code</pre>
    html = Regex.Replace(html, @"(?x)
        (?<!\)
        {{
        (
            (?:
                [^}]
                |
                (?:
                    }(?!})
                    |
                    \(?:}})+
                )
            )*
        )
        (?<!\)
        }}     (?!})", (Match m) =>
    {
        // The earlier steps already ran over the code text, so turn <br /> back into "\n" and
        // the em/strong tags back into the "_" / "*" characters they replaced.
        // HACK: undo em & strong inside code
        string processed = Common.TabsToSpaces(m.Groups[1].Value.Replace("<br />", "\n")
            .Replace("<em>", "_")
            .Replace("</em>", "_")
            .Replace("<strong>", "*")
            .Replace("</strong>", "*"));

        // Longest line of the block, measured after replacing each &entity; with one space,
        // removing tags and trimming trailing whitespace.
        // (Functional.Map / Functional.Maximum are defined elsewhere; presumably map + max.)
        int maxLen = Functional.Maximum(Functional.Map(
            processed.Split('\n'), s => Regex.Replace(Regex.Replace(s, @"&\w+;", " "), "<[^>]*>", "").TrimEnd().Length));

        // A line that starts with "!" is highlighted: the "!" is dropped, the indentation
        // ($1, plus one extra space $2 when the indentation starts with a space) stays outside
        // the tag, and the rest of the line is wrapped in <code class='highlight'>.
        // NOTE(review): group 3 is greedy and nothing anchors the end of the line, so group 4
        // looks like it will always be empty - verify.
        processed = Regex.Replace(processed, @"
            ^!
            (
              ([ ])?     # if indentation is in spaces, we add one to replace the !
              [ \t]*     # capture this to put it outside the <code> tags
            )
            ([^\n\r]*)
            ([ \t]*)     # capture this to put it outside the <code> tags
            ", 
            "$2$1<code class='highlight'>$3</code>$4", 
            RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);

        // Blocks whose longest line has at least MinLenForExplicitWidth characters get an
        // explicit width of (maxLen + 0.8) / WidthDivisor em; shorter blocks get no style.
        // MAGIC: boo yah!
        const int MinLenForExplicitWidth = 81;
        const double WidthDivisor = 1.77;

        return string.Format("<pre{0}>{1}</pre>", 
            maxLen >= MinLenForExplicitWidth ? string.Format(" style='width:{0:0.00}em'", (maxLen + 0.8) / WidthDivisor) : "",
            processed);
    }, RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace);


    // Final step: drop the escape character in front of escaped markup, keeping only the
    // escaped text (group 1).
    // NOTE(review): as shown, the literal starts with "!" rather than a backslash and the
    // character class is "[[\`_*!]"; presumably the real source has "\\" and "[\[\\`_*!]" -
    // confirm.
    // remove backslashes in escapes
    html = Regex.Replace(html, @"(?x)
        !       (
            [[\`_*!]
            |
            \}\}
        )", "$1");

    return html;
}

/// <summary>
/// Rewrites footnote-style links into inline double-bracket links.
/// A use looks like <c>&amp;quot;link text&amp;quot;[1]</c> (the text is already HTML-encoded);
/// its declaration is a later line of the form <c>[1] target</c>, where target is either a
/// scheme://... URL or an item name with an optional /number suffix.
/// </summary>
/// <param name="html">HTML-encoded markup to scan.</param>
/// <returns>
/// The text with every use that has a declaration further down replaced by
/// <c>[[target:link text]]</c>, and with all declaration lines (including the line break
/// before and after them) removed. Uses without a later declaration are left untouched.
/// </returns>
private string ParseSeparatedHyperlinks(string html)
{
    // copied in part, with modifications, from ParseMarkup()
    // Group 1 is the reference id including its brackets, group 2 the link target.
    // The URL branch accepts balanced parentheses via the named group p; the reference id
    // brackets and the parentheses must be backslash-escaped or the pattern does not parse
    // (or stops right after "://").
    const string DeclarationPattern = @"
        (?:\r\n)?          # optionally strip out the preceding newline
        ^(\[\d+\])\s*      # our reference ID (allow any trailing spaces)
        (
          \w+:// (?: (?<p>\() | [\w#~!@#$%&-+=:;,./?] | (?(p)(?<-p>\))) )+ (?<![.,:;!])
          |
          [a-zA-Z0-9][a-zA-Z0-9 -_#.:[\],]+(?:/\d+)?
        )
        (?:\r\n)?          # ditto the beginning";
    const RegexOptions DeclarationOptions = RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace;

    // Index the declarations by reference id. The same id may be declared several times;
    // Regex.Matches yields matches in ascending position, so each list is already ordered
    // by Index and needs no extra sort.
    var decl = new Dictionary<string, List<Match>>();

    foreach (Match m in Regex.Matches(html, DeclarationPattern, DeclarationOptions))
    {
        List<Match> list;
        string key = m.Groups[1].Value;

        if (!decl.TryGetValue(key, out list))
            decl[key] = list = new List<Match>();

        list.Add(m);
    }

    int delimiterLength = "&quot;".Length;

    // Group 1 is the quoted link text (quotes included), group 2 the reference id.
    string replaced = Regex.Replace(html, @"(&quot;(?:(?!&quot;)[^\r\n])+&quot;)(\[\d+\])",
        use =>
        {
            List<Match> candidates;

            if (!decl.TryGetValue(use.Groups[2].Value, out candidates))
                return use.Value;

            // A use binds to the first declaration of its id that appears after it.
            Match dec = candidates.Find(m => m.Index > use.Index);

            // Find returns null when every declaration precedes the use; check before
            // touching dec.Groups (the original dereferenced dec first and threw here).
            if (dec == null)
                return use.Value;

            string quoted = use.Groups[1].Value;
            string text = quoted.Substring(delimiterLength, quoted.Length - delimiterLength * 2);
            string link = dec.Groups[2].Value;

            // '_' -> ' ' required for valid names (see Time.Item.EncodeName)
            return string.Format("[[{0}:{1}]]", link.Contains("://") ? link : link.Replace('_', ' '), text);
        });

    // The declarations have served their purpose; remove them from the output.
    return Regex.Replace(replaced, DeclarationPattern, "", DeclarationOptions);
}

/// <summary>
/// Converts bullet lines ("- item", indented by two spaces per nesting level) into nested
/// ul/li markup, leaving all other lines as they are.
/// </summary>
/// <param name="html">Text to process; lines are separated by '\n'.</param>
/// <returns>
/// The text with list lines replaced by list markup. Non-list lines are emitted followed by
/// '\n'; list item lines are emitted without a trailing '\n'. Because a sentinel empty line
/// is processed at the end (to close any list still open), the result ends with one more
/// '\n' than the input had lines.
/// </returns>
private static string ParseListMarkup(string html)
{
    const string Bullet = "- ";

    int nestLevel = -1;               // current list depth; -1 means "not inside a list"
    StringBuilder sb = new StringBuilder();
    bool inCodeBlock = false;         // true while between an unescaped "{{" and a "}}"

    // The appended "\n" yields a final empty line, which closes lists that run to the end.
    foreach (string line in (html + "\n").Split('\n'))
    {
        bool nestingIncreased = false;
        bool wasInCodeBlock = inCodeBlock;

        // A "{{" that is not preceded by a backslash opens a code block; any "}}" on the line
        // closes it again. (The backslash in the lookbehind must itself be escaped, otherwise
        // "\)" escapes the parenthesis and the pattern is rejected.)
        inCodeBlock |= Regex.IsMatch(line, @"(?<!\\){{");
        inCodeBlock &= !line.Contains("}}");

        // Count the leading spaces.
        int spaces = 0;
        while (spaces < line.Length && line[spaces] == ' ')
            spaces++;

        // Not a bullet line: -2 becomes level -1 after the division below.
        if (line.Length < spaces + Bullet.Length || line.Substring(spaces, Bullet.Length) != Bullet)
            spaces = -2;

        // Two spaces of indentation per nesting level.
        spaces /= 2;

        // we're going up a nesting level
        if (spaces > nestLevel)
        {
            for (int i = nestLevel; i < spaces; i++)
                sb.Append("<ul><li>");

            nestLevel = spaces;
            nestingIncreased = true;
        }
        // we're going down a nesting level (never while inside, or just leaving, a code block)
        else if (nestLevel >= 0 && spaces < nestLevel && !inCodeBlock && !wasInCodeBlock)
        {
            for (int i = spaces; i < nestLevel; i++)
                sb.Append("</li></ul>");

            nestLevel = spaces;
        }

        if (nestLevel >= 0 && !wasInCodeBlock)
        {
            // if nesting increased, we already emitted an <li>
            if (!nestingIncreased)
                sb.Append("</li><li>");

            // Emit the item text without its indentation and bullet.
            sb.Append(line.Substring(spaces * 2 + Bullet.Length));
        }
        else
        {
            sb.Append(line);
            sb.Append('\n');
        }
    }

    return sb.ToString();
}
#endregion