md2html.awk

log tags | changeset raw browse | file diff annotate file log raw

view md2html.awk @ 15:fa51423d5292

Back to the default branch
author yiyus@1936
date Tue Jul 21 17:25:19 2009 +0200 (2009-07-21 ago)
parents 8d0cd6f66ee7
children

1 #!/bin/awk -f
2 #
3 # by: Jesus Galan (yiyus) 2009
4 #
5 # Usage: md2html.awk file.md > file.html
6 # See: http://4l77.com/src/md2html.awk
# eschtml(t): return a copy of t that is safe to emit as HTML text or inside
# a double-quoted attribute value: "&" becomes "&amp;" and "<" becomes "&lt;".
function eschtml(t) {
	# "&" has to be handled first, otherwise the "&" introduced by "&lt;"
	# would be escaped a second time.  In a gsub() replacement a bare "&"
	# stands for the matched text, so a literal one is written as "\\&".
	gsub(/&/, "\\&amp;", t);
	gsub(/</, "\\&lt;", t);
	return t;
}
# oprint(t): output one line of generated HTML.
# While the global counter nr is non-zero the line is appended to the global
# buffer otext (separated by a newline) instead of being written; with nr at
# zero it is printed immediately.
function oprint(t){
	if(nr != 0){
		otext = otext "\n" t;
		return;
	}
	print t;
}
# subref(id): resolve buffered placeholders for reference id.
#
# nextil() writes an unresolved reference as "<<id" immediately followed by
# the closing double quote of the attribute, and counts it in nr.  This
# function replaces every such placeholder in otext with ref[id], decrements
# nr once per replacement and, when nothing is pending any more, prints and
# clears the buffer.
#
# The placeholder is located with index() and spliced with substr() rather
# than sub(): the id is matched literally (no regex metacharacters), the
# closing quote keeps id "a" from matching inside "<<ab", and a "&" in the
# URL is inserted as-is instead of expanding to the matched text.
function subref(id,    needle, pos, done) {
	needle = "<<" id "\"";
	done = "";
	while(nr > 0 && (pos = index(otext, needle)) > 0){
		# Move the resolved part out of otext so it is never rescanned.
		done = done substr(otext, 1, pos - 1) ref[id] "\"";
		otext = substr(otext, pos + length(needle));
		nr--;
	}
	otext = done otext;
	if(nr == 0 && otext) {
		print otext;
		otext = "";
	}
}
# nextil(t): translate the inline Markdown in t to HTML and return it.
#
# The first special character (or "![") in t is located; the text before it
# is kept, the construct starting there is translated, and the function
# recurses on whatever follows.  State shared across the recursion lives in
# globals reset by inline(): ilcode / ilcode2 (inside a `code` / ``code``
# span) and stag[1..ns] (stack of open emphasis markers).  References that
# are not defined yet are emitted as the placeholder "<<id" and counted in
# the global nr.
#
# Everything after t in the parameter list is a local scratch variable.
# The function is recursive and builds its result with expressions such as
# t1 tag nextil(t2); keeping the scratch values local means a nested call
# can no longer overwrite them.
function nextil(t,    t1, tag, t2, url, linktext, title, alt, id, r, ntag, n) {
	if(!match(t, /[`<&\[*_\\-]|(\!\[)/))
		return t;
	t1 = substr(t, 1, RSTART - 1);
	tag = substr(t, RSTART, RLENGTH);
	t2 = substr(t, RSTART + RLENGTH);
	# Inside a code span only a backtick is special.
	if(ilcode && tag != "`")
		return eschtml(t1 tag) nextil(t2);
	# Backslash escaping
	if(tag == "\\"){
		if(match(t2, /^[\\`*_{}\[\]()#+\-\.!]/)){
			tag = substr(t2, 1, 1);
			t2 = substr(t2, 2);
		}
		return t1 tag nextil(t2);
	}
	# Dashes: "--" becomes an em dash, a single "-" is kept.
	if(tag == "-"){
		if(sub(/^-/, "", t2))
			tag = "&#8212;";
		return t1 tag nextil(t2);
	}
	# Inline Code
	if(tag == "`"){
		if(sub(/^`/, "", t2)){
			# A double backtick with no closing pair is emitted
			# as a quote character.
			if(!match(t2, /``/))
				return t1 "&#8221;" nextil(t2);
			ilcode2 = !ilcode2;
		}
		else if(ilcode2)
			# Single backtick inside a ``...`` span is literal.
			return t1 tag nextil(t2);
		tag = "<code>";
		if(ilcode){
			t1 = eschtml(t1);
			tag = "</code>";
		}
		ilcode = !ilcode;
		return t1 tag nextil(t2);
	}
	if(tag == "<"){
		# Autolinks
		if(match(t2, /^[^ ]+[\.@][^ ]+>/)){
			url = eschtml(substr(t2, 1, RLENGTH - 1));
			t2 = substr(t2, RLENGTH + 1);
			linktext = url;
			if(match(url, /@/) && !match(url, /^mailto:/))
				url = "mailto:" url;
			return t1 "<a href=\"" url "\">" linktext "</a>" nextil(t2);
		}
		# Html tags
		if(match(t2, /^[A-Za-z\/!][^>]*>/)){
			tag = tag substr(t2, RSTART, RLENGTH);
			t2 = substr(t2, RLENGTH + 1);
			return t1 tag nextil(t2);
		}
		return t1 "&lt;" nextil(t2);
	}
	# Html special entities
	if(tag == "&"){
		if(match(t2, /^#?[A-Za-z0-9]+;/)){
			tag = tag substr(t2, RSTART, RLENGTH);
			t2 = substr(t2, RLENGTH + 1);
			return t1 tag nextil(t2);
		}
		return t1 "&amp;" nextil(t2);
	}
	# Images
	if(tag == "!["){
		if(!match(t2, /(\[.*\])|(\(.*\))/))
			return t1 tag nextil(t2);
		match(t2, /^[^\]]*/);
		alt = substr(t2, 1, RLENGTH);
		t2 = substr(t2, RLENGTH + 2);
		if(match(t2, /^\(/)){
			# Inline
			sub(/^\(/, "", t2);
			match(t2, /^[^\)]+/);
			url = eschtml(substr(t2, 1, RLENGTH));
			t2 = substr(t2, RLENGTH + 2);
			title = "";
			if(match(url, /[ ]+\".*\"[ ]*$/)) {
				title = substr(url, RSTART, RLENGTH);
				url = substr(url, 1, RSTART - 1);
				match(title, /\".*\"/);
				title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
			}
			if(match(url, /^<.*>$/))
				url = substr(url, 2, RLENGTH - 2);
			return t1 "<img src=\"" url "\" alt=\"" alt "\"" title " />" nextil(t2);
		}
		else{
			# Referenced
			sub(/^ ?\[/, "", t2);
			id = alt;
			if(match(t2, /^[^\]]+/))
				id = substr(t2, 1, RLENGTH);
			t2 = substr(t2, RLENGTH + 2);
			if(ref[id])
				r = ref[id];
			else{
				r = "<<" id;
				nr++;
			}
			return t1 "<img src=\"" r "\" alt=\"" alt "\" />" nextil(t2);
		}
	}
	# Links
	if(tag == "["){
		if(!match(t2, /(\[.*\])|(\(.*\))/))
			return t1 tag nextil(t2);
		match(t2, /^[^\]]*(\[[^\]]*\][^\]]*)*/);
		linktext = substr(t2, 1, RLENGTH);
		t2 = substr(t2, RLENGTH + 2);
		if(match(t2, /^\(/)){
			# Inline
			match(t2, /^[^\)]+(\([^\)]+\)[^\)]*)*/);
			# url must be cut out before t2 is advanced: both use
			# the RLENGTH of the match above.
			url = substr(t2, 2, RLENGTH - 1);
			t2 = substr(t2, RLENGTH + 2);
			title = "";
			if(match(url, /[ ]+\".*\"[ ]*$/)) {
				title = substr(url, RSTART, RLENGTH);
				url = substr(url, 1, RSTART - 1);
				match(title, /\".*\"/);
				title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
			}
			if(match(url, /^<.*>$/))
				url = substr(url, 2, RLENGTH - 2);
			url = eschtml(url);
			return t1 "<a href=\"" url "\"" title ">" nextil(linktext) "</a>" nextil(t2);
		}
		else{
			# Referenced
			sub(/^ ?\[/, "", t2);
			id = linktext;
			if(match(t2, /^[^\]]+/))
				id = substr(t2, 1, RLENGTH);
			t2 = substr(t2, RLENGTH + 2);
			if(ref[id])
				r = ref[id];
			else{
				r = "<<" id;
				nr++;
			}
			# The opening tag is a normal start tag, not a
			# self-closing one: the link text and </a> follow.
			return t1 "<a href=\"" r "\">" nextil(linktext) "</a>" nextil(t2);
		}
	}
	# Emphasis
	if(match(tag, /[*_]/)){
		ntag = tag;
		# The marker is compared as a plain character; "*" is not a
		# portable regular expression on its own.
		if(substr(t2, 1, 1) == tag){
			t2 = substr(t2, 2);
			if(stag[ns] == tag && substr(t2, 1, 1) == tag)
				# Three markers while a single one is open:
				# close it first and leave the pair for later.
				t2 = tag t2;
			else
				ntag = tag tag;
		}
		n = length(ntag);
		# A marker with a space on both sides is literal text.
		if(match(t1, / $/) && match(t2, /^ /))
			return t1 ntag nextil(t2);
		tag = (n == 2) ? "strong" : "em";
		if(stag[ns] == ntag){
			tag = "/" tag;
			ns--;
		}
		else
			stag[++ns] = ntag;
		tag = "<" tag ">";
		return t1 tag nextil(t2);
	}
}
# inline(t): translate the inline Markdown of one block of text.
# Clears the span state used by nextil() (code-span flags and the depth of
# the emphasis stack) and returns nextil(t).
function inline(t) {
	ilcode = ilcode2 = ns = 0;
	return nextil(t);
}
# printp(tag): flush the text accumulated in the global variable text.
# Text consisting only of spaces (or nothing) is discarded.  Otherwise it is
# run through inline() and handed to oprint(), wrapped in <tag>...</tag>
# unless tag is the empty string.  text is always empty afterwards.
function printp(tag) {
	if(match(text, /^[ ]*$/)){
		text = "";
		return;
	}
	text = inline(text);
	oprint((tag != "") ? "<" tag ">" text "</" tag ">" : text);
	text = "";
}
# Initial parser state.
#   blank, code, hr, html  flags tested and updated by the rules below
#   nl                     number of entries in use in block[]
#   nr                     pending "<<id" placeholders (see oprint/subref)
#   otext                  output held back while nr is non-zero
#   text                   text collected for the current paragraph
#   par                    tag printp() wraps the paragraph in
BEGIN {
	blank = code = hr = html = nl = nr = 0;
	otext = text = "";
	par = "p";
}
# References
# A line of the form   [id]: url "optional title"   outside a code block
# defines reference id.  The definition is stored in ref[id] as the escaped
# url, followed by the text that closes the attribute and opens a title
# attribute when a title is present; users of ref[id] append the final quote.
# Placeholders already buffered for this id are resolved through subref().
!code && /^ *\[[^\]]*\]:[ ]+/ {
	sub(/^ *\[/, "");
	match($0, /\]/);
	id = substr($0, 1, RSTART - 1);
	# Remove "id]" by position instead of interpolating id into a regular
	# expression, so ids containing characters such as "+", "." or "("
	# are handled.  The rule's pattern guarantees ":" and blanks follow.
	$0 = substr($0, RSTART + 1);
	sub(/^:[ ]+/, "");
	title = "";
	if(match($0, /\".*\"$/))
		title = "\" title=\"" substr($0, RSTART + 1, RLENGTH - 2);
	sub(/[ ]+\".*\"$/, "");
	url = eschtml($0);
	ref[id] = url title;

	subref(id);
	next;
}
# html
# Raw block-level HTML is copied to the output untouched.
#
# Opening rule: the line starts with one of the listed block tags (or an
# HTML comment).  html is set when the element is not closed on the same
# line; hr additionally marks an unterminated <hr tag.
!html && /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/ {
	if(code){
		oprint("</pre></code>");
		# Reset the flag like every other place that closes a code
		# block, so the closing tags are not emitted a second time.
		code = 0;
	}
	for(; !text && block[nl] == "blockquote"; nl--)
		oprint("</blockquote>");
	match($0, /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/);
	htag = substr($0, 2, RLENGTH - 1);
	if(!match($0, "(<\\/" htag ">)|((^<hr ?\\/?)|(--)>$)"))
		html = 1;
	if(html && match($0, /^<hr/))
		hr = 1;
	oprint($0);
	next;
}

# Closing rule: a closing block tag, the end of a comment, or (for a pending
# <hr) any line ending in ">" terminates the raw HTML section.
html && (/(^<\/(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
isindex|menu|noframes|noscript|ol|p|pre|table|ul).*)|(--)>$/ ||
(hr && />$/)) {
	html = 0;
	hr = 0;
	oprint($0);
	next;
}

# Any other line inside a raw HTML section is passed through.
html {
	oprint($0);
	next;
}
280 # List and quote blocks
# The rules in this section run in order on every remaining input line.
# block[1..nl] holds the block elements currently open in the output;
# nnl and nblock[] describe the nesting found on the current line.  The
# last rules of the section close and open elements until they agree.
282 # Remove indentation
# For each open level, strip that level's prefix from the line: the text
# matched by /^( | )/ for a list level, "> " (space optional) for a
# blockquote.  The loop stops at the first level whose prefix is missing,
# leaving in nnl the number of levels that were matched.
# NOTE(review): both alternatives of /^( | )/ read as a single space here;
# whitespace may have been lost in this copy of the file - confirm.
283 {
284 for(nnl = 0; nnl < nl; nnl++)
285 if((match(block[nnl + 1], /[ou]l/) && !sub(/^( | )/, "")) || \
286 (block[nnl + 1] == "blockquote" && !sub(/^> ?/, "")))
287 break;
288 }
# A line that is missing some prefixes but directly continues pending text
# (no blank line before it, not a list item) stays at the current depth.
289 nnl < nl && !blank && text && ! /^ ? ? ?([*+-]|([0-9]+\.)+)( +| )/ { nnl = nl; }
290 # Quote blocks
# Every further "> " prefix adds one blockquote level for this line.
291 {
292 while(sub(/^> /, ""))
293 nblock[++nnl] = "blockquote";
294 }
295 # Horizontal rules
# hr is recomputed for every line.  A rule is three or more of "-", "*" or
# "_" (spaces allowed between them), accepted after a blank line or when no
# text is pending outside a code block.  It resets the line's depth to 0.
296 { hr = 0; }
297 (blank || (!text && !code)) && /^ ? ? ?([-*_][ ]*)([-*_][ ]*)([-*_][ ]*)+$/ {
298 if(code){
299 oprint("</pre></code>");
300 code = 0;
301 }
302 blank = 0;
303 nnl = 0;
304 hr = 1;
305 }
306 # List items
# An empty line while the innermost open block is a list is only recorded
# in blank; processing of the line stops here.
307 block[nl] ~ /[ou]l/ && /^$/ {
308 blank = 1;
309 next;
310 }
# newli is set when the current line starts a list item.
311 { newli = 0; }
# Unordered item marker (*, + or -): remove it and record a "ul" level.
312 !hr && (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?[*+-]( +| )/ {
313 sub(/^ ? ? ?[*+-]( +| )/, "");
314 nnl++;
315 nblock[nnl] = "ul";
316 newli = 1;
317 }
# Ordered item marker (digits followed by "."): remove it and record "ol".
318 (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?([0-9]+\.)+( +| )/ {
319 sub(/^ ? ? ?([0-9]+\.)+( +| )/, "");
320 nnl++;
321 nblock[nnl] = "ol";
322 newli = 1;
323 }
# New item: flush the text of the previous one.  If a blank line preceded
# an item at the same depth and par is empty, the text is wrapped in <p>.
# An item at the same depth and of the same list type as the open one is
# separated from it with "</li><li>".
324 newli {
325 if(blank && nnl == nl && !par)
326 par = "p";
327 blank = 0;
328 printp(par);
329 if(nnl == nl && block[nl] == nblock[nl])
330 oprint("</li><li>");
331 }
# First non-empty line after a recorded blank line: flush the pending text
# as its own paragraph and go back to plain "p" paragraphs.
332 blank && ! /^$/ {
333 if(match(block[nnl], /[ou]l/) && !par)
334 par = "p";
335 printp(par);
336 par = "p";
337 blank = 0;
338 }
340 # Close old blocks and open new ones
# The nesting changed: close an open code block, flush pending text, and
# choose the paragraph tag for what follows (none directly inside a list
# item, "p" elsewhere).
341 nnl != nl || nblock[nl] != block[nl] {
342 if(code){
343 oprint("</pre></code>");
344 code = 0;
345 }
346 printp(par);
347 b = (nnl > nl) ? nblock[nnl] : block[nnl];
348 par = (match(b, /[ou]l/)) ? "" : "p";
349 }
# Close open elements from the innermost outwards; a list also closes its
# last item.
# NOTE(review): pblock is not assigned anywhere in the code visible here,
# so pblock[nl] compares as an empty string and the loop closes the level
# at nnl as well (the next rule then reopens it).  This may be a typo for
# nblock, but the output depends on the current behaviour - verify before
# changing it.
350 nnl < nl || (nnl == nl && nblock[nl] != block[nl]) {
351 for(; nl > nnl || (nnl == nl && pblock[nl] != block[nl]); nl--){
352 if(match(block[nl], /[ou]l/))
353 oprint("</li>");
354 oprint("</" block[nl] ">");
355 }
356 }
# Open the missing levels from the outside in; a list opens its first item.
357 nnl > nl {
358 for(; nl < nnl; nl++){
359 block[nl + 1] = nblock[nl + 1];
360 oprint("<" block[nl + 1] ">");
361 if(match(block[nl + 1], /[ou]l/))
362 oprint("<li>");
363 }
364 }
# A horizontal rule detected above is emitted once the nesting is settled.
365 hr {
366 oprint("<hr>");
367 next;
368 }
# Code blocks
# Empty line inside a code block.  It is not written yet: codeblank counts
# the empty lines seen since the last code line so they can be reproduced
# if the block continues, and dropped if it ends here.  (The previous test
# of the never-assigned variable "blanK" lost all of them.)  blank is still
# set for the benefit of the other rules.
code && /^$/ {
	codeblank++;
	blank = 1;
	next;
}
# A line indented by four spaces or one tab, with no paragraph text
# pending, is a line of code.  The indentation is removed, the block is
# opened if necessary, held-back empty lines are written and the line is
# emitted with "&" and "<" escaped.
!text && sub(/^(    |\t)/, "") {
	blank = 0;
	if(!code){
		oprint("<code><pre>");
		# Empty lines counted before this block opened are not its.
		codeblank = 0;
	}
	for(; codeblank > 0; codeblank--)
		oprint("");
	code = 1;
	$0 = eschtml($0);
	oprint($0);
	next;
}
# Any other line ends the code block; trailing empty lines are discarded.
code {
	oprint("</pre></code>");
	code = 0;
	codeblank = 0;
}
# Setext-style headers
# A line made only of "=" (or only of "-") directly below pending text
# turns that text into an h1 (or h2).
text && /^(=+|-+)$/ {
	printp(/^=/ ? "h1" : "h2");
	next;
}

# Atx-style headers
# One leading "#" (with following spaces) and one trailing "#" are removed
# per level, up to six levels; the rest of the line becomes the text of an
# "h<level>" paragraph.
/^#/ && (!newli || par == "p" || /^##/) {
	n = 0;
	while(n < 6 && sub(/^# */, "")){
		sub(/#$/, "");
		n++;
	}
	par = "h" n;
}

# Paragraph
# An empty line ends the current paragraph.
/^$/ {
	printp(par);
	par = "p";
	next;
}

# Add text
# Everything else is appended to the pending text, separated by a space.
{
	if(text)
		text = text " " $0;
	else
		text = "" $0;
}
# End of input: close an open code block, flush the last paragraph, close
# every element still open (innermost first; a list also closes its last
# item), remove placeholders of references that were never defined and
# print whatever is left in the output buffer.
END {
	if(code){
		oprint("</pre></code>");
		code = 0;
	}
	printp(par);
	while(nl > 0){
		if(block[nl] ~ /[ou]l/)
			oprint("</li>");
		oprint("</" block[nl] ">");
		nl--;
	}
	gsub(/<<[^\"]*/, "", otext);
	print(otext);
}