md2html.awk

log tags | changeset raw browse | bz2 | zip | gz

changeset 15:fa51423d5292 tip

Back to the default branch
author yiyus@1936
date Tue Jul 21 17:25:19 2009 +0200 (2009-07-21 ago)
parents 8d0cd6f66ee7
children
files md2html.awk
line diff
#!/bin/awk -f
#
# by: Jesus Galan (yiyus) 2009
#
# Usage: md2html.awk file.md > file.html
# See: http://4l77.com/src/md2html.awk
#
# Global state shared by the rules below:
#   text      text of the paragraph being accumulated (joined with spaces)
#   par       tag used to wrap `text` when it is flushed ("" = no wrapper)
#   block[]   stack of currently open blocks ("ul", "ol", "blockquote")
#   nl        depth of block[]
#   nblock[]  blocks detected on the current input line
#   nnl       depth of nblock[] for the current input line
#   blank     a blank line was seen and not yet acted upon
#   code      inside an indented code block
#   cblank    blank lines seen inside a code block, not yet printed
#   html      inside a multi-line raw html block
#   hr        current line is a horizontal rule (or an open <hr tag)
#   ref[]     reference definitions: id -> url (plus optional title)
#   nr        number of unresolved "<<id" placeholders held in otext
#   otext     output held back while nr > 0
#
# NOTE(review): this copy was restored from a whitespace-collapsed listing.
# The "four spaces or a tab" indentation patterns follow the Markdown rule
# for nested/code blocks -- confirm against the upstream file.

# Escape the characters that are special in html text and attributes.
function eschtml(t) {
	gsub("&", "\\&amp;", t);
	gsub("<", "\\&lt;", t);
	return t;
}

# Print t, or hold it back while forward references are still unresolved.
function oprint(t){
	if(nr == 0)
		print t;
	else
		otext = otext "\n" t;
}

# Replace pending placeholders for reference `id` in the held-back output,
# then flush the output once no placeholder is left.
# The placeholder is always emitted as  <<id"  (inside an attribute), so the
# closing quote is matched too; this keeps id "a" from matching "<<ab".
# Plain index/substr is used instead of sub() so that regex metacharacters
# in id and "&" in the url are taken literally.
function subref(id,    mark, pos){
	mark = "<<" id "\"";
	while(nr > 0 && (pos = index(otext, mark)) > 0){
		otext = substr(otext, 1, pos - 1) ref[id] "\"" \
			substr(otext, pos + length(mark));
		nr--;
	}
	if(nr == 0 && otext) {
		print otext;
		otext = "";
	}
}

# Convert the inline markup of t to html, one construct at a time: find the
# first special character, handle it, and recurse on the remainder.
# Everything after `t` in the parameter list is a local variable; the
# function is recursive, so its scratch values must not be shared.
# Uses globals: ilcode, ilcode2 (inline code state), stag[], ns (open
# emphasis stack), ref[], nr.
function nextil(t,    t1, tag, t2, url, linktext, title, alt, id, r, pt2, ntag, n) {
	if(!match(t, /[`<&\[*_\\-]|(\!\[)/))
		return t;
	t1 = substr(t, 1, RSTART - 1);
	tag = substr(t, RSTART, RLENGTH);
	t2 = substr(t, RSTART + RLENGTH);
	# Inside inline code only a backtick is special
	if(ilcode && tag != "`")
		return eschtml(t1 tag) nextil(t2);
	# Backslash escaping
	if(tag == "\\"){
		if(match(t2, /^[\\`*_{}\[\]()#+\-\.!]/)){
			tag = substr(t2, 1, 1);
			t2 = substr(t2, 2);
		}
		return t1 tag nextil(t2);
	}
	# Dashes
	if(tag == "-"){
		if(sub(/^-/, "", t2))
			tag = "&#8212;";
		return t1 tag nextil(t2);
	}
	# Inline Code
	if(tag == "`"){
		if(sub(/^`/, "", t2)){
			if(!match(t2, /``/))
				return t1 "&#8221;" nextil(t2);
			ilcode2 = !ilcode2;
		}
		else if(ilcode2)
			return t1 tag nextil(t2);
		tag = "<code>";
		if(ilcode){
			t1 = eschtml(t1);
			tag = "</code>";
		}
		ilcode = !ilcode;
		return t1 tag nextil(t2);
	}
	if(tag == "<"){
		# Autolinks
		if(match(t2, /^[^ ]+[\.@][^ ]+>/)){
			url = eschtml(substr(t2, 1, RLENGTH - 1));
			t2 = substr(t2, RLENGTH + 1);
			linktext = url;
			if(match(url, /@/) && !match(url, /^mailto:/))
				url = "mailto:" url;
			return t1 "<a href=\"" url "\">" linktext "</a>" nextil(t2);
		}
		# Html tags
		if(match(t2, /^[A-Za-z\/!][^>]*>/)){
			tag = tag substr(t2, RSTART, RLENGTH);
			t2 = substr(t2, RLENGTH + 1);
			return t1 tag nextil(t2);
		}
		return t1 "&lt;" nextil(t2);
	}
	# Html special entities
	if(tag == "&"){
		if(match(t2, /^#?[A-Za-z0-9]+;/)){
			tag = tag substr(t2, RSTART, RLENGTH);
			t2 = substr(t2, RLENGTH + 1);
			return t1 tag nextil(t2);
		}
		return t1 "&amp;" nextil(t2);
	}
	# Images
	if(tag == "!["){
		if(!match(t2, /(\[.*\])|(\(.*\))/))
			return t1 tag nextil(t2);
		match(t2, /^[^\]]*/);
		alt = substr(t2, 1, RLENGTH);
		t2 = substr(t2, RLENGTH + 2);
		if(match(t2, /^\(/)){
			# Inline
			sub(/^\(/, "", t2);
			match(t2, /^[^\)]+/);
			url = substr(t2, 1, RLENGTH);
			t2 = substr(t2, RLENGTH + 2);
			title = "";
			if(match(url, /[ ]+\".*\"[ ]*$/)) {
				title = substr(url, RSTART, RLENGTH);
				url = substr(url, 1, RSTART - 1);
				match(title, /\".*\"/);
				title = " title=\"" eschtml(substr(title, RSTART + 1, RLENGTH - 2)) "\"";
			}
			# Strip <...> before escaping; afterwards "<" is gone
			if(match(url, /^<.*>$/))
				url = substr(url, 2, RLENGTH - 2);
			url = eschtml(url);
			return t1 "<img src=\"" url "\" alt=\"" alt "\"" title " />" nextil(t2);
		}
		else{
			# Referenced
			sub(/^ ?\[/, "", t2);
			id = alt;
			if(match(t2, /^[^\]]+/)){
				id = substr(t2, 1, RLENGTH);
				t2 = substr(t2, RLENGTH + 2);
			}
			else
				sub(/^\]/, "", t2);	# implicit id: ![alt][]
			if(ref[id])
				r = ref[id];
			else{
				# Not defined yet: leave a placeholder for subref()
				r = "<<" id;
				nr++;
			}
			return t1 "<img src=\"" r "\" alt=\"" alt "\" />" nextil(t2);
		}
	}
	# Links
	if(tag == "["){
		if(!match(t2, /(\[.*\])|(\(.*\))/))
			return t1 tag nextil(t2);
		match(t2, /^[^\]]*(\[[^\]]*\][^\]]*)*/);
		linktext = substr(t2, 1, RLENGTH);
		t2 = substr(t2, RLENGTH + 2);
		if(match(t2, /^\(/)){
			# Inline
			match(t2, /^[^\)]+(\([^\)]+\)[^\)]*)*/);
			url = substr(t2, 2, RLENGTH - 1);
			pt2 = substr(t2, RLENGTH + 2);
			title = "";
			if(match(url, /[ ]+\".*\"[ ]*$/)) {
				title = substr(url, RSTART, RLENGTH);
				url = substr(url, 1, RSTART - 1);
				match(title, /\".*\"/);
				title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
			}
			if(match(url, /^<.*>$/))
				url = substr(url, 2, RLENGTH - 2);
			url = eschtml(url);
			return t1 "<a href=\"" url "\"" title ">" nextil(linktext) "</a>" nextil(pt2);
		}
		else{
			# Referenced
			sub(/^ ?\[/, "", t2);
			id = linktext;
			if(match(t2, /^[^\]]+/)){
				id = substr(t2, 1, RLENGTH);
				t2 = substr(t2, RLENGTH + 2);
			}
			else
				sub(/^\]/, "", t2);	# implicit id: [text][]
			if(ref[id])
				r = ref[id];
			else{
				# Not defined yet: leave a placeholder for subref()
				r = "<<" id;
				nr++;
			}
			pt2 = t2;
			return t1 "<a href=\"" r "\">" nextil(linktext) "</a>" nextil(pt2);
		}
	}
	# Emphasis
	if(tag == "*" || tag == "_"){
		ntag = tag;
		# Literal comparison: "^" tag would build the regex "^*" for "*"
		if(substr(t2, 1, 1) == tag){
			t2 = substr(t2, 2);
			if(stag[ns] == tag && substr(t2, 1, 1) == tag)
				t2 = tag t2;
			else
				ntag = tag tag;
		}
		n = length(ntag);
		# A marker surrounded by spaces is literal text
		if(match(t1, / $/) && match(t2, /^ /))
			return t1 ntag nextil(t2);
		tag = (n == 2) ? "strong" : "em";
		if(stag[ns] == ntag){
			tag = "/" tag;
			ns--;
		}
		else
			stag[++ns] = ntag;
		tag = "<" tag ">";
		return t1 tag nextil(t2);
	}
}

# Convert all inline markup of t, starting from a clean inline state.
function inline(t) {
	ilcode = 0;
	ilcode2 = 0;
	ns = 0;

	return nextil(t);
}

# Flush the accumulated paragraph text, wrapped in <tag> unless tag is "".
function printp(tag) {
	if(!match(text, /^[ ]*$/)){
		text = inline(text);
		if(tag != "")
			oprint("<" tag ">" text "</" tag ">");
		else
			oprint(text);
	}
	text = "";
}

BEGIN {
	blank = 0;
	cblank = 0;
	code = 0;
	hr = 0;
	html = 0;
	nl = 0;
	nr = 0;
	otext = "";
	text = "";
	par = "p";
	# Block-level html tags that are passed through untouched
	htmlblk = "address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|" \
		"isindex|menu|noframes|noscript|ol|p|pre|table|ul";
}

# References
!code && /^ *\[[^\]]*\]:[ ]+/ {
	sub(/^ *\[/, "");
	match($0, /\]/);
	id = substr($0, 1, RSTART - 1);
	# Drop "id]:" by position; id may contain regex metacharacters
	$0 = substr($0, RSTART + 1);
	sub(/^:[ ]+/, "");
	title = "";
	if(match($0, /\".*\"$/))
		title = "\" title=\"" substr($0, RSTART + 1, RLENGTH - 2);
	sub(/[ ]+\".*\"$/, "");
	url = eschtml($0);
	ref[id] = url title;

	subref(id);
	next;
}

# html
!html && $0 ~ ("^<(" htmlblk "|!--)") {
	if(code){
		oprint("</code></pre>");
		code = 0;
	}
	for(; !text && block[nl] == "blockquote"; nl--)
		oprint("</blockquote>");
	match($0, "^<(" htmlblk "|!--)");
	htag = substr($0, 2, RLENGTH - 1);
	# Stay in html mode unless the block is closed on this same line
	if(!match($0, "(<\\/" htag ">)|((^<hr ?\\/?)|(--)>$)"))
		html = 1;
	if(html && match($0, /^<hr/))
		hr = 1;
	oprint($0);
	next;
}

html && ($0 ~ ("(^</(" htmlblk ").*)|(--)>$") || (hr && />$/)) {
	html = 0;
	hr = 0;
	oprint($0);
	next;
}

html {
	oprint($0);
	next;
}

# List and quote blocks

# Remove indentation
{
	for(nnl = 0; nnl < nl; nnl++)
		if((match(block[nnl + 1], /[ou]l/) && !sub(/^(    |\t)/, "")) ||
		(block[nnl + 1] == "blockquote" && !sub(/^> ?/, "")))
			break;
}
# Lazy continuation: an unindented line keeps the paragraph in its blocks
nnl < nl && !blank && text && ! /^ ? ? ?([*+-]|([0-9]+\.)+)( +|\t)/ { nnl = nl; }
# Quote blocks
{
	while(sub(/^> /, ""))
		nblock[++nnl] = "blockquote";
}
# Horizontal rules
{ hr = 0; }
(blank || (!text && !code)) && /^ ? ? ?([-*_][ ]*)([-*_][ ]*)([-*_][ ]*)+$/ {
	if(code){
		oprint("</code></pre>");
		code = 0;
	}
	blank = 0;
	nnl = 0;
	hr = 1;
}
# List items
block[nl] ~ /[ou]l/ && /^$/ {
	blank = 1;
	next;
}
{ newli = 0; }
!hr && (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?[*+-]( +|\t)/ {
	sub(/^ ? ? ?[*+-]( +|\t)/, "");
	nnl++;
	nblock[nnl] = "ul";
	newli = 1;
}
(nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?([0-9]+\.)+( +|\t)/ {
	sub(/^ ? ? ?([0-9]+\.)+( +|\t)/, "");
	nnl++;
	nblock[nnl] = "ol";
	newli = 1;
}
newli {
	if(blank && nnl == nl && !par)
		par = "p";
	blank = 0;
	printp(par);
	if(nnl == nl && block[nl] == nblock[nl])
		oprint("</li><li>");
}
blank && ! /^$/ {
	if(match(block[nnl], /[ou]l/) && !par)
		par = "p";
	printp(par);
	par = "p";
	blank = 0;
}

# Close old blocks and open new ones
nnl != nl || nblock[nl] != block[nl] {
	if(code){
		oprint("</code></pre>");
		code = 0;
	}
	printp(par);
	b = (nnl > nl) ? nblock[nnl] : block[nnl];
	par = (match(b, /[ou]l/)) ? "" : "p";
}
nnl < nl || (nnl == nl && nblock[nl] != block[nl]) {
	for(; nl > nnl || (nnl == nl && nblock[nl] != block[nl]); nl--){
		if(match(block[nl], /[ou]l/))
			oprint("</li>");
		oprint("</" block[nl] ">");
	}
}
nnl > nl {
	for(; nl < nnl; nl++){
		block[nl + 1] = nblock[nl + 1];
		oprint("<" block[nl + 1] ">");
		if(match(block[nl + 1], /[ou]l/))
			oprint("<li>");
	}
}
hr {
	oprint("<hr>");
	next;
}

# Code blocks
# Blank lines inside a code block are only counted here; they are printed
# when (and if) another code line follows, so trailing ones are dropped.
code && /^$/ {
	cblank++;
	blank = 1;
	next;
}
!text && sub(/^(    |\t)/, "") {
	blank = 0;
	if(!code){
		cblank = 0;
		oprint("<pre><code>");
	}
	for(; cblank > 0; cblank--)
		oprint("");
	code = 1;
	$0 = eschtml($0);
	oprint($0);
	next;
}
code {
	oprint("</code></pre>");
	code = 0;
}

# Setext-style Headers
text && /^=+$/ {printp("h1"); next;}
text && /^-+$/ {printp("h2"); next;}

# Atx-Style headers
/^#+/ && (!newli || par=="p" || /^##/) {
	for(n = 0; n < 6 && sub(/^# */, ""); n++)
		sub(/#$/, "");
	par = "h" n;
}

# Paragraph
/^$/ {
	printp(par);
	par = "p";
	next;
}

# Add text
{ text = (text ? text " " : "") $0; }

END {
	if(code){
		oprint("</code></pre>");
		code = 0;
	}
	printp(par);
	for(; nl > 0; nl--){
		if(match(block[nl], /[ou]l/))
			oprint("</li>");
		oprint("</" block[nl] ">");
	}
	# References that were never defined: drop their placeholders
	gsub(/<<[^\"]*/, "", otext);
	print(otext);
}