md2html.awk
changeset 15:fa51423d5292 tip
Back to the default branch
| author | yiyus@1936 |
|---|---|
| date | Tue Jul 21 17:25:19 2009 +0200 (2009-07-21 ago) |
| parents | 8d0cd6f66ee7 |
| children | |
| files | md2html.awk |
line diff
1.1 --- a/md2html.awk Thu Jun 04 17:37:05 2009 -0500
1.2 +++ b/md2html.awk Tue Jul 21 17:25:19 2009 +0200
1.3 @@ -1,176 +1,427 @@
1.4 #!/bin/awk -f
1.5 +#
1.6 +# by: Jesus Galan (yiyus) 2009
1.7 +#
1.8 +# Usage: md2html.awk file.md > file.html
1.9 +# See: http://4l77.com/src/md2html.awk
1.10
1.11 -# md2html.awk
1.12 -# by: Jesus Galan (yiyus) <yiyu.jgl@gmail>, May 2009
1.13 -# Usage:
1.14 -# md2html file.md > file.html
1.15 -# Options: -v esc=false to not escape html
1.16 -
1.17 -function newblock(nblock){
1.18 - if(text)
1.19 - print "<" block ">" text "</" block ">";
1.20 - text = "";
1.21 - block = nblock ? nblock : "p";
1.22 +function eschtml(t) { # Return t with "&" and "<" replaced by the HTML entities &amp; and &lt;.
1.23 + gsub("&", "\\&amp;", t); # "\\&" is a literal ampersand in the replacement; runs first so the "&" of "&lt;" is not escaped again
1.24 + gsub("<", "\\&lt;", t);
1.25 + return t;
1.26 }
1.27
1.28 -function subinline(tgl, inl){
1.29 - while(match($0, tgl)){
1.30 - if (inline[ni] == inl)
1.31 - ni -= sub(tgl, "</" inl ">");
1.32 - else if (sub(tgl, "<" inl ">"))
1.33 - inline[++ni] = inl;
1.34 +function oprint(t){ # Emit one output line: buffered in otext while references are pending, printed otherwise.
1.35 + if(nr != 0) # nr is the number of "<<id" placeholders not yet substituted
1.36 + otext = otext "\n" t;
1.37 + else
1.38 + print t;
1.39 +}
1.40 +
1.41 +function subref(id){ # Replace pending "<<id" placeholders in otext with ref[id]; print and clear otext once none remain.
1.42 + for(; nr > 0 && sub("<<" id, ref[id], otext); nr--); # NOTE(review): ref[id] is a sub() replacement, so an "&" in it expands to the matched text - confirm
1.43 + if(nr == 0 && otext) {
1.44 + print otext;
1.45 + otext = "";
1.46 }
1.47 }
1.48
1.49 -function dolink(href, lnk){
1.50 - # Undo escaped html in uris
1.51 - gsub(/&/, "\\&", href);
1.52 - gsub(/</, "<", href);
1.53 - gsub(/>/, ">", href);
1.54 - # & can be tricky, and not standard:
1.55 - gsub(/&/, "\\\\\\&", href);
1.56 - gsub(/&/, "\\\\\\&", lnk);
1.57 - return "<a href=\"" href "\">" lnk "</a>";
1.58 +function nextil(t) { # Convert the first inline construct found in t and recurse on the remainder; returns the HTML string.
1.59 + if(!match(t, /[`<&\[*_\\-]|(\!\[)/))
1.60 + return t;
1.61 + t1 = substr(t, 1, RSTART - 1);
1.62 + tag = substr(t, RSTART, RLENGTH);
1.63 + t2 = substr(t, RSTART + RLENGTH);
1.64 + if(ilcode && tag != "`") # inside a code span everything but a backtick is literal
1.65 + return eschtml(t1 tag) nextil(t2);
1.66 + # Backslash escaping
1.67 + if(tag == "\\"){
1.68 + if(match(t2, /^[\\`*_{}\[\]()#+\-\.!]/)){
1.69 + tag = substr(t2, 1, 1);
1.70 + t2 = substr(t2, 2);
1.71 + }
1.72 + return t1 tag nextil(t2);
1.73 + }
1.74 + # Dashes
1.75 + if(tag == "-"){
1.76 + if(sub(/^-/, "", t2))
1.77 + tag = "&#8212;"; # "--" becomes an em dash entity
1.78 + return t1 tag nextil(t2);
1.79 + }
1.80 + # Inline Code
1.81 + if(tag == "`"){
1.82 + if(sub(/^`/, "", t2)){
1.83 + if(!match(t2, /``/))
1.84 + return t1 "&#8221;" nextil(t2); # unmatched "``" is emitted as a right double quote entity
1.85 + ilcode2 = !ilcode2;
1.86 + }
1.87 + else if(ilcode2)
1.88 + return t1 tag nextil(t2);
1.89 + tag = "<code>";
1.90 + if(ilcode){
1.91 + t1 = eschtml(t1);
1.92 + tag = "</code>";
1.93 + }
1.94 + ilcode = !ilcode;
1.95 + return t1 tag nextil(t2);
1.96 + }
1.97 + if(tag == "<"){
1.98 + # Autolinks
1.99 + if(match(t2, /^[^ ]+[\.@][^ ]+>/)){
1.100 + url = eschtml(substr(t2, 1, RLENGTH - 1));
1.101 + t2 = substr(t2, RLENGTH + 1);
1.102 + linktext = url;
1.103 + if(match(url, /@/) && !match(url, /^mailto:/))
1.104 + url = "mailto:" url;
1.105 + return t1 "<a href=\"" url "\">" linktext "</a>" nextil(t2);
1.106 + }
1.107 + # Html tags
1.108 + if(match(t2, /^[A-Za-z\/!][^>]*>/)){
1.109 + tag = tag substr(t2, RSTART, RLENGTH);
1.110 + t2 = substr(t2, RLENGTH + 1);
1.111 + return t1 tag nextil(t2);
1.112 + }
1.113 + return t1 "&lt;" nextil(t2); # a "<" that starts neither a link nor a tag must be escaped
1.114 + }
1.115 + # Html special entities
1.116 + if(tag == "&"){
1.117 + if(match(t2, /^#?[A-Za-z0-9]+;/)){
1.118 + tag = tag substr(t2, RSTART, RLENGTH);
1.119 + t2 = substr(t2, RLENGTH + 1);
1.120 + return t1 tag nextil(t2);
1.121 + }
1.122 + return t1 "&amp;" nextil(t2); # a bare "&" that is not an entity must be escaped
1.123 + }
1.124 + # Images
1.125 + if(tag == "!["){
1.126 + if(!match(t2, /(\[.*\])|(\(.*\))/))
1.127 + return t1 tag nextil(t2);
1.128 + match(t2, /^[^\]]*/);
1.129 + alt = substr(t2, 1, RLENGTH);
1.130 + t2 = substr(t2, RLENGTH + 2);
1.131 + if(match(t2, /^\(/)){
1.132 + # Inline
1.133 + sub(/^\(/, "", t2);
1.134 + match(t2, /^[^\)]+/);
1.135 + url = eschtml(substr(t2, 1, RLENGTH));
1.136 + t2 = substr(t2, RLENGTH + 2);
1.137 + title = "";
1.138 + if(match(url, /[ ]+\".*\"[ ]*$/)) {
1.139 + title = substr(url, RSTART, RLENGTH);
1.140 + url = substr(url, 1, RSTART - 1);
1.141 + match(title, /\".*\"/);
1.142 + title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
1.143 + }
1.144 + if(match(url, /^<.*>$/))
1.145 + url = substr(url, 2, RLENGTH - 2);
1.146 + return t1 "<img src=\"" url "\" alt=\"" alt "\"" title " />" nextil(t2);
1.147 + }
1.148 + else{
1.149 + # Referenced
1.150 + sub(/^ ?\[/, "", t2);
1.151 + id = alt;
1.152 + if(match(t2, /^[^\]]+/))
1.153 + id = substr(t2, 1, RLENGTH);
1.154 + t2 = substr(t2, RLENGTH + 2);
1.155 + if(ref[id])
1.156 + r = ref[id];
1.157 + else{
1.158 + r = "<<" id; # placeholder, replaced by subref() when the reference is defined
1.159 + nr++;
1.160 + }
1.161 + return t1 "<img src=\"" r "\" alt=\"" alt "\" />" nextil(t2);
1.162 + }
1.163 + }
1.164 + # Links
1.165 + if(tag == "["){
1.166 + if(!match(t2, /(\[.*\])|(\(.*\))/))
1.167 + return t1 tag nextil(t2);
1.168 + match(t2, /^[^\]]*(\[[^\]]*\][^\]]*)*/);
1.169 + linktext = substr(t2, 1, RLENGTH);
1.170 + t2 = substr(t2, RLENGTH + 2);
1.171 + if(match(t2, /^\(/)){
1.172 + # Inline
1.173 + match(t2, /^[^\)]+(\([^\)]+\)[^\)]*)*/);
1.174 + url = substr(t2, 2, RLENGTH - 1);
1.175 + pt2 = substr(t2, RLENGTH + 2);
1.176 + title = "";
1.177 + if(match(url, /[ ]+\".*\"[ ]*$/)) {
1.178 + title = substr(url, RSTART, RLENGTH);
1.179 + url = substr(url, 1, RSTART - 1);
1.180 + match(title, /\".*\"/);
1.181 + title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
1.182 + }
1.183 + if(match(url, /^<.*>$/))
1.184 + url = substr(url, 2, RLENGTH - 2);
1.185 + url = eschtml(url);
1.186 + return t1 "<a href=\"" url "\"" title ">" nextil(linktext) "</a>" nextil(pt2);
1.187 + }
1.188 + else{
1.189 + # Referenced
1.190 + sub(/^ ?\[/, "", t2);
1.191 + id = linktext;
1.192 + if(match(t2, /^[^\]]+/))
1.193 + id = substr(t2, 1, RLENGTH);
1.194 + t2 = substr(t2, RLENGTH + 2);
1.195 + if(ref[id])
1.196 + r = ref[id];
1.197 + else{
1.198 + r = "<<" id; # placeholder, replaced by subref() when the reference is defined
1.199 + nr++;
1.200 + }
1.201 + pt2 = t2; # saved because nextil(linktext) overwrites the global t2
1.202 + return t1 "<a href=\"" r "\">" nextil(linktext) "</a>" nextil(pt2);
1.203 + }
1.204 + }
1.205 + # Emphasis
1.206 + if(match(tag, /[*_]/)){
1.207 + ntag = tag;
1.208 + if(sub("^" tag, "", t2)){
1.209 + if(stag[ns] == tag && match(t2, "^" tag))
1.210 + t2 = tag t2;
1.211 + else
1.212 + ntag = tag tag
1.213 + }
1.214 + n = length(ntag);
1.215 + tag = (n == 2) ? "strong" : "em";
1.216 + if(match(t1, / $/) && match(t2, /^ /))
1.217 + return t1 ntag nextil(t2); # a marker surrounded by spaces is literal: emit the marker itself, not the tag name
1.218 + if(stag[ns] == ntag){
1.219 + tag = "/" tag;
1.220 + ns--;
1.221 + }
1.222 + else
1.223 + stag[++ns] = ntag;
1.224 + tag = "<" tag ">";
1.225 + return t1 tag nextil(t2);
1.226 + }
1.227 +}
1.228 +
1.229 +function inline(t) { # Render the inline markup of t with nextil(), starting from a clean span state.
1.230 + ns = 0; # empty emphasis stack stag[]
1.231 + ilcode2 = 0; # not inside a double-backtick span
1.232 + ilcode = 0; # not inside <code>
1.233 +
1.234 + return nextil(t);
1.235 +}
1.236 +
1.237 +function printp(tag) { # Flush the pending text through inline(), wrapped in <tag>...</tag> unless tag is empty.
1.238 + if(!match(text, /^[ ]*$/)){ # blank-only text produces no output
1.239 + text = inline(text);
1.240 + if(tag == "")
1.241 + oprint(text);
1.242 + else
1.243 + oprint("<" tag ">" text "</" tag ">");
1.244 + }
1.245 + text = "";
1.246 }
1.247
1.248 BEGIN {
1.249 - ni = 0; # inlines
1.250 - nl = 0; # nested lists
1.251 + blank = 0; # set after a blank line inside a list or code block
1.252 + code = 0; # inside an indented code block
1.253 + hr = 0; # current line is a horizontal rule, or an <hr tag is still open
1.254 + html = 0; # inside a raw HTML block
1.255 + nl = 0; # depth of the open list/blockquote stack block[]
1.256 + nr = 0; # unresolved reference placeholders held in otext
1.257 + otext = ""; # output buffered while nr > 0
1.258 text = "";
1.259 - block = "p";
1.260 + par = "p"; # tag that printp() wraps around the pending text
1.261 }
1.262
1.263 -# Escape html
1.264 -esc != "false" {
1.265 - gsub("&", "\\&")
1.266 - gsub("<", "\\<")
1.267 - gsub(">", "\\>")
1.268 -}
1.269 +# References: a line of the form [id]: url "title" is stored in ref[id]; the line itself is not output
1.270 +!code && /^ *\[[^\]]*\]:[ ]+/ {
1.271 + sub(/^ *\[/, "");
1.272 + match($0, /\]/);
1.273 + id = substr($0, 1, RSTART - 1);
1.274 + sub(id "\\]:[ ]+", ""); # NOTE(review): id is interpolated into a regex, so metacharacters in it are not literal - confirm
1.275 + title = "";
1.276 + if(match($0, /\".*\"$/))
1.277 + title = "\" title=\"" substr($0, RSTART + 1, RLENGTH - 2); # closes href and opens title; the caller appends the final quote
1.278 + sub(/[ ]+\".*\"$/, "");
1.279 + url = eschtml($0);
1.280 + ref[id] = url title;
1.281
1.282 -# Horizontal rules (_ is not in markdown)
1.283 -/^[ ]*([-*_] ?)+[ ]*$/ && text == "" {
1.284 - print "<hr>";
1.285 + subref(id); # resolve placeholders emitted before this definition was seen
1.286 next;
1.287 }
1.288
1.289 -# Tables (not in markdown)
1.290 -# Syntax:
1.291 -# Right Align| Center Align |Left Align
1.292 -/([ ]\|)|(\|[ ])/ {
1.293 - if(block != "table")
1.294 - newblock("table");
1.295 - nc = split($0, cells, "|");
1.296 - $0 = "<tr>\n";
1.297 - for(i = 1; i <= nc; i++){
1.298 - align = "left";
1.299 - if(sub(/^[ ]+/, "", cells[i])){
1.300 - if(sub(/[ ]+$/, "", cells[i]))
1.301 - align = "center";
1.302 - else
1.303 - align = "right";
1.304 - }
1.305 - sub(/[ ]+$/,"", cells[i]);
1.306 - $0 = $0 "<td align=\"" align "\">" cells[i] "</td>\n";
1.307 - }
1.308 - $0 = $0 "</tr>";
1.309 +# html: a block-level tag at column 0 starts a raw HTML block that is copied through unchanged
1.310 +!html && /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
1.311 +isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/ {
1.312 + if(code){ oprint("</pre></code>");
1.313 + code = 0; } # reset the flag, otherwise the code rule prints the closing tags a second time
1.314 + for(; !text && block[nl] == "blockquote"; nl--)
1.315 + oprint("</blockquote>");
1.316 + match($0, /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
1.317 +isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/);
1.318 + htag = substr($0, 2, RLENGTH - 1); # tag name without the leading "<"
1.319 + if(!match($0, "(<\\/" htag ">)|((^<hr ?\\/?)|(--)>$)"))
1.320 + html = 1; # not closed on this line: keep copying until the closing rule matches
1.321 + if(html && match($0, /^<hr/))
1.322 + hr = 1;
1.323 + oprint($0);
1.324 + next;
1.325 }
1.326
1.327 -# Ordered and unordered (possibly nested) lists
1.328 -/^[ ]*([*+-]|(([0-9]+[\.-]?)+))[ ]/ {
1.329 - newblock("li");
1.330 - nnl = 1;
1.331 - while(match($0, /^[ ]/)){
1.332 - sub(/^[ ]/,"");
1.333 - nnl++;
1.334 - }
1.335 - while(nl > nnl)
1.336 - print "</" list[nl--] ">";
1.337 - while(nl < nnl){
1.338 - list[++nl] = "ol";
1.339 - if(match($0, /^[*+-]/))
1.340 - list[nl] = "ul";
1.341 - print "<" list[nl] ">";
1.342 - }
1.343 - sub(/^([*+-]|(([0-9]+[\.-]?)+))[ ]/,"");
1.344 +html && (/(^<\/(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
1.345 +isindex|menu|noframes|noscript|ol|p|pre|table|ul).*)|(--)>$/ ||
1.346 +(hr && />$/)) {
1.347 + html = 0; # closing tag, end of comment, or end of an open <hr tag: leave raw HTML mode
1.348 + hr = 0;
1.349 + oprint($0);
1.350 + next;
1.351 }
1.352
1.353 -# Multi line list items
1.354 -block == "li" {
1.355 - sub(/^( *)|( *)/,"");
1.356 +html {
1.357 + oprint($0); # inside a raw HTML block lines are copied through untouched
1.358 + next;
1.359 +}
1.360 +
1.361 +# List and quote blocks: nnl/nblock[] describe the nesting of the current line, nl/block[] the nesting already open
1.362 +
1.363 +# Remove indentation
1.364 +{
1.365 + for(nnl = 0; nnl < nl; nnl++)
1.366 + if((match(block[nnl + 1], /[ou]l/) && !sub(/^(    |\t)/, "")) || \
1.367 + (block[nnl + 1] == "blockquote" && !sub(/^> ?/, "")))
1.368 + break;
1.369 +}
1.370 +nnl < nl && !blank && text && ! /^ ? ? ?([*+-]|([0-9]+\.)+)( +|\t)/ { nnl = nl; } # lazy continuation line: stay in the current block
1.371 +# Quote blocks
1.372 +{
1.373 + while(sub(/^> /, ""))
1.374 + nblock[++nnl] = "blockquote";
1.375 +}
1.376 +# Horizontal rules
1.377 +{ hr = 0; }
1.378 +(blank || (!text && !code)) && /^ ? ? ?([-*_][ ]*)([-*_][ ]*)([-*_][ ]*)+$/ {
1.379 + if(code){
1.380 + oprint("</pre></code>");
1.381 + code = 0;
1.382 + }
1.383 + blank = 0;
1.384 + nnl = 0; # a rule closes every open block
1.385 + hr = 1;
1.386 +}
1.387 +# List items
1.388 +block[nl] ~ /[ou]l/ && /^$/ {
1.389 + blank = 1;
1.390 + next;
1.391 +}
1.392 +{ newli = 0; }
1.393 +!hr && (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?[*+-]( +|\t)/ {
1.394 + sub(/^ ? ? ?[*+-]( +|\t)/, "");
1.395 + nnl++;
1.396 + nblock[nnl] = "ul";
1.397 + newli = 1;
1.398 +}
1.399 +(nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?([0-9]+\.)+( +|\t)/ {
1.400 + sub(/^ ? ? ?([0-9]+\.)+( +|\t)/, "");
1.401 + nnl++;
1.402 + nblock[nnl] = "ol";
1.403 + newli = 1;
1.404 +}
1.405 +newli {
1.406 + if(blank && nnl == nl && !par)
1.407 + par = "p"; # items separated by a blank line get paragraph tags
1.408 + blank = 0;
1.409 + printp(par);
1.410 + if(nnl == nl && block[nl] == nblock[nl])
1.411 + oprint("</li><li>");
1.412 +}
1.413 +blank && ! /^$/ {
1.414 + if(match(block[nnl], /[ou]l/) && !par)
1.415 + par = "p";
1.416 + printp(par);
1.417 + par = "p";
1.418 + blank = 0;
1.419 +}
1.420 +
1.421 +# Close old blocks and open new ones
1.422 +nnl != nl || nblock[nl] != block[nl] {
1.423 + if(code){
1.424 + oprint("</pre></code>");
1.425 + code = 0;
1.426 + }
1.427 + printp(par);
1.428 + b = (nnl > nl) ? nblock[nnl] : block[nnl];
1.429 + par = (match(b, /[ou]l/)) ? "" : "p"; # text directly inside a list item is not wrapped
1.430 +}
1.431 +nnl < nl || (nnl == nl && nblock[nl] != block[nl]) {
1.432 + for(; nl > nnl || (nnl == nl && pblock[nl] != block[nl]); nl--){ # NOTE(review): pblock is never assigned in this file, so this test is true for any open block - confirm intent
1.433 + if(match(block[nl], /[ou]l/))
1.434 + oprint("</li>");
1.435 + oprint("</" block[nl] ">");
1.436 + }
1.437 +}
1.438 +nnl > nl {
1.439 + for(; nl < nnl; nl++){
1.440 + block[nl + 1] = nblock[nl + 1];
1.441 + oprint("<" block[nl + 1] ">");
1.442 + if(match(block[nl + 1], /[ou]l/))
1.443 + oprint("<li>");
1.444 + }
1.445 +}
1.446 +hr {
1.447 + oprint("<hr>");
1.448 + next;
1.449 }
1.450
1.451 # Code blocks
1.452 -/^( | )/ {
1.453 - if(block != "pre")
1.454 - newblock("pre");
1.455 - sub(/^( | )/, "");
1.456 - text = text $0 "\n";
1.457 +code && /^$/ {
1.458 + cblank++; # blank lines inside a code block are counted here and printed by the next code line
1.459 + # (the earlier "blank && ! /^$/" rule clears blank before that line arrives, so blank cannot carry them)
1.460 + blank = 1;
1.461 + next;
1.462 +}
1.463 +!text && sub(/^(    |\t)/, "") { # a code line is indented by four spaces or one tab
1.464 + for(; code && cblank > 0; cblank--) # replay pending blank lines only if the same block is still open
1.465 + oprint("");
1.466 + blank = cblank = 0;
1.467 + if(!code)
1.468 + oprint("<code><pre>");
1.469 + code = 1;
1.470 + $0 = eschtml($0);
1.471 + oprint($0);
1.472 + next;
1.473 +}
1.474 +code {
1.475 + oprint("</pre></code>"); # first non-code line after a code block closes it
1.476 + code = 0;
1.477 +}
1.478 +
1.479 +# Setext-style headers: a line of "=" or "-" turns the pending text into h1 / h2
1.480 +text && /^=+$/ {printp("h1"); next;}
1.481 +text && /^-+$/ {printp("h2"); next;}
1.482 +
1.483 +# Atx-style headers
1.484 +/^#+/ && (!newli || par=="p" || /^##/) {
1.485 + for(n = 0; n < 6 && sub(/^# */, ""); n++) # n = number of leading "#" removed, at most 6
1.486 + sub(/#$/, ""); # one trailing "#" is dropped per leading "#"
1.487 + par = "h" n;
1.488 +}
1.489 +
1.490 +# Paragraph: an empty line flushes the pending text and resets the wrapper to <p>
1.491 +/^$/ {
1.492 + printp(par);
1.493 + par = "p";
1.494 next;
1.495 }
1.496
1.497 -# Paragraph
1.498 -/^$/ {
1.499 - newblock();
1.500 - while(nl > 0)
1.501 - print "</" list[nl--] ">";
1.502 -}
1.503 -
1.504 -# Setex-style Headers
1.505 -# (Plus h3 with underscores.)
1.506 -/^=+$/ {
1.507 - block = "h" 1;
1.508 - next;
1.509 -}
1.510 -
1.511 -/^-+$/ {
1.512 - block = "h" 2;
1.513 - next;
1.514 -}
1.515 -
1.516 -/^_+$/ {
1.517 - block = "h" 3;
1.518 - next;
1.519 -}
1.520 -
1.521 -# Atx-style headers
1.522 -/^#/ {
1.523 - newblock();
1.524 - match($0, /#+/);
1.525 - n = RLENGTH;
1.526 - if(n > 6)
1.527 - n = 6;
1.528 - text = substr($0, RLENGTH + 1);
1.529 - block = "h" n;
1.530 - next;
1.531 -}
1.532 -
1.533 -// {
1.534 - # Images
1.535 - while(match($0, /!\[[^\]]+\]\([^\)]+\)/)){
1.536 - split(substr($0, RSTART + 2, RLENGTH - 3), a, /\]\(/);
1.537 - sub(/!\[[^\]]+\]\([^\)]+\)/, "<img src=\"" a[2] "\" alt=\"" a[1] "\">");
1.538 - }
1.539 - # Links
1.540 - while(match($0, /\[[^\]]+\]\([^\)]+\)/)){
1.541 - split(substr($0, RSTART + 1, RLENGTH - 2), a, /\]\(/);
1.542 - sub(/\[[^\]]+\]\([^\)]+\)/, dolink(a[2], a[1]));
1.543 - }
1.544 - # Auto links (uri matching is poor)
1.545 - na = split($0, a, /(^\()|[ ]|([,\.\)]([ ]|$))/);
1.546 - for(i = 1; i <= na; i++)
1.547 - if(match(a[i], /^(((https?|ftp|file|news|irc):\/\/)|(mailto:)).+$/))
1.548 - sub(a[i], dolink(a[i], a[i]));
1.549 - # Inline
1.550 - subinline("(\\*\\*)|(__)", "strong");
1.551 - subinline("\\*", "em");
1.552 - subinline("`", "code");
1.553 - text = text (text ? " " : "") $0;
1.554 -}
1.555 +# Add text: a line that reaches this rule is appended to the pending text, joined by a single space
1.556 +{ text = (text ? text " " : "") $0; }
1.557
1.558 END {
1.559 - while(ni > 0)
1.560 - text = text "</" inline[ni--] ">";
1.561 - newblock();
1.562 - while(nl > 0)
1.563 - print "</" list[nl--] ">";
1.564 + if(code){ # close a code block still open at end of input
1.565 + oprint("</pre></code>");
1.566 + code = 0;
1.567 + }
1.568 + printp(par); # flush the last pending text
1.569 + for(; nl > 0; nl--){ # close every list/blockquote still open
1.570 + if(match(block[nl], /[ou]l/))
1.571 + oprint("</li>");
1.572 + oprint("</" block[nl] ">");
1.573 + }
1.574 + gsub(/<<[^\"]*/, "", otext); # remove placeholders whose reference was never defined
1.575 + print(otext);
1.576 }