1 | // Copyright 2013 The Go Authors. All rights reserved. |
---|---|
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // This program takes an HTML file and outputs a corresponding article file in |
6 | // present format. See: golang.org/x/tools/present |
7 | package main // import "golang.org/x/tools/cmd/html2article" |
8 | |
9 | import ( |
10 | "bytes" |
11 | "errors" |
12 | "flag" |
13 | "fmt" |
14 | "io" |
15 | "log" |
16 | "net/url" |
17 | "os" |
18 | "regexp" |
19 | "strings" |
20 | |
21 | "golang.org/x/net/html" |
22 | "golang.org/x/net/html/atom" |
23 | ) |
24 | |
25 | func main() { |
26 | flag.Parse() |
27 | |
28 | err := convert(os.Stdout, os.Stdin) |
29 | if err != nil { |
30 | log.Fatal(err) |
31 | } |
32 | } |
33 | |
34 | func convert(w io.Writer, r io.Reader) error { |
35 | root, err := html.Parse(r) |
36 | if err != nil { |
37 | return err |
38 | } |
39 | |
40 | style := find(root, isTag(atom.Style)) |
41 | if err := parseStyles(style); err != nil { |
42 | log.Printf("couldn't parse all styles: %v", err) |
43 | } |
44 | |
45 | body := find(root, isTag(atom.Body)) |
46 | if body == nil { |
47 | return errors.New("couldn't find body") |
48 | } |
49 | article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body)))) |
50 | _, err = fmt.Fprintf(w, "Title\n\n%s", article) |
51 | return err |
52 | } |
53 | |
// Style names an inline text style that a CSS rule in the input document can
// select.
type Style string

// The inline styles recognized by parseStyles. Each value mirrors the marker
// string that text passes to highlight for the matching element.
const (
	Bold   Style = "*"
	Italic Style = "_"
	Code   Style = "`"
)

// cssRules maps a CSS selector, as written in the document's <style> element,
// to the Style its declarations imply. It is filled in by parseStyles and
// consulted by hasStyle.
var cssRules = map[string]Style{}
63 | |
64 | func parseStyles(style *html.Node) error { |
65 | if style == nil || style.FirstChild == nil { |
66 | return errors.New("couldn't find styles") |
67 | } |
68 | |
69 | styles := style.FirstChild.Data |
70 | readUntil := func(end rune) (string, bool) { |
71 | i := strings.IndexRune(styles, end) |
72 | if i < 0 { |
73 | return "", false |
74 | } |
75 | s := styles[:i] |
76 | styles = styles[i:] |
77 | return s, true |
78 | } |
79 | |
80 | for { |
81 | sel, ok := readUntil('{') |
82 | if !ok && sel == "" { |
83 | break |
84 | } else if !ok { |
85 | return fmt.Errorf("could not parse selector %q", styles) |
86 | } |
87 | |
88 | value, ok := readUntil('}') |
89 | if !ok { |
90 | return fmt.Errorf("couldn't parse style body for %s", sel) |
91 | } |
92 | switch { |
93 | case strings.Contains(value, "italic"): |
94 | cssRules[sel] = Italic |
95 | case strings.Contains(value, "bold"): |
96 | cssRules[sel] = Bold |
97 | case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"): |
98 | cssRules[sel] = Code |
99 | } |
100 | } |
101 | return nil |
102 | } |
103 | |
// newlineRun matches two or more consecutive newline characters.
var newlineRun = regexp.MustCompile(`\n\n+`)

// limitNewlineRuns returns s with every run of two or more newlines collapsed
// to exactly two, so that no more than one blank line separates any two
// lines.
func limitNewlineRuns(s string) string {
	const blankLine = "\n\n"
	return newlineRun.ReplaceAllLiteralString(s, blankLine)
}
109 | |
110 | func makeHeadings(body string) string { |
111 | buf := new(bytes.Buffer) |
112 | lines := strings.Split(body, "\n") |
113 | for i, s := range lines { |
114 | if i == 0 && !isBoldTitle(s) { |
115 | buf.WriteString("* Introduction\n\n") |
116 | } |
117 | if isBoldTitle(s) { |
118 | s = strings.TrimSpace(strings.Replace(s, "*", " ", -1)) |
119 | s = "* " + s |
120 | } |
121 | buf.WriteString(s) |
122 | buf.WriteByte('\n') |
123 | } |
124 | return buf.String() |
125 | } |
126 | |
// isBoldTitle reports whether s contains no space character and both starts
// and ends with '*'.
func isBoldTitle(s string) bool {
	if strings.Contains(s, " ") {
		return false
	}
	return strings.HasPrefix(s, "*") && strings.HasSuffix(s, "*")
}
132 | |
// indent appends s to buf one line at a time. Each non-empty line is prefixed
// with a tab; empty lines are written without one. Every line, including the
// last, is terminated by a newline.
func indent(buf *bytes.Buffer, s string) {
	for _, line := range strings.Split(s, "\n") {
		if line == "" {
			buf.WriteByte('\n')
			continue
		}
		buf.WriteString("\t" + line + "\n")
	}
}
142 | |
// unwrap appends s to buf with hard line breaks removed. Each line is trimmed
// of surrounding white space; consecutive non-blank lines are joined with a
// single space, and the first blank line after a run of text is written as
// two newlines. Blank lines that do not follow text produce no output.
func unwrap(buf *bytes.Buffer, s string) {
	inParagraph := false
	for _, raw := range strings.Split(s, "\n") {
		line := strings.TrimSpace(raw)
		switch {
		case line != "" && inParagraph:
			buf.WriteByte(' ')
			buf.WriteString(line)
		case line != "":
			buf.WriteString(line)
			inParagraph = true
		case inParagraph:
			// The first blank line after text closes the paragraph.
			buf.WriteString("\n\n")
			inParagraph = false
		}
	}
}
162 | |
163 | func text(n *html.Node) string { |
164 | var buf bytes.Buffer |
165 | walk(n, func(n *html.Node) bool { |
166 | switch n.Type { |
167 | case html.TextNode: |
168 | buf.WriteString(n.Data) |
169 | return false |
170 | case html.ElementNode: |
171 | // no-op |
172 | default: |
173 | return true |
174 | } |
175 | a := n.DataAtom |
176 | if a == atom.Span { |
177 | switch { |
178 | case hasStyle(Code)(n): |
179 | a = atom.Code |
180 | case hasStyle(Bold)(n): |
181 | a = atom.B |
182 | case hasStyle(Italic)(n): |
183 | a = atom.I |
184 | } |
185 | } |
186 | switch a { |
187 | case atom.Br: |
188 | buf.WriteByte('\n') |
189 | case atom.P: |
190 | unwrap(&buf, childText(n)) |
191 | buf.WriteString("\n\n") |
192 | case atom.Li: |
193 | buf.WriteString("- ") |
194 | unwrap(&buf, childText(n)) |
195 | buf.WriteByte('\n') |
196 | case atom.Pre: |
197 | indent(&buf, childText(n)) |
198 | buf.WriteByte('\n') |
199 | case atom.A: |
200 | href, text := attr(n, "href"), childText(n) |
201 | // Skip links with no text. |
202 | if strings.TrimSpace(text) == "" { |
203 | break |
204 | } |
205 | // Don't emit empty links. |
206 | if strings.TrimSpace(href) == "" { |
207 | buf.WriteString(text) |
208 | break |
209 | } |
210 | // Use original url for Google Docs redirections. |
211 | if u, err := url.Parse(href); err != nil { |
212 | log.Printf("parsing url %q: %v", href, err) |
213 | } else if u.Host == "www.google.com" && u.Path == "/url" { |
214 | href = u.Query().Get("q") |
215 | } |
216 | fmt.Fprintf(&buf, "[[%s][%s]]", href, text) |
217 | case atom.Code: |
218 | buf.WriteString(highlight(n, "`")) |
219 | case atom.B: |
220 | buf.WriteString(highlight(n, "*")) |
221 | case atom.I: |
222 | buf.WriteString(highlight(n, "_")) |
223 | case atom.Img: |
224 | src := attr(n, "src") |
225 | fmt.Fprintf(&buf, ".image %s\n", src) |
226 | case atom.Iframe: |
227 | src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height") |
228 | fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w) |
229 | case atom.Param: |
230 | if attr(n, "name") == "movie" { |
231 | // Old style YouTube embed. |
232 | u := attr(n, "value") |
233 | u = strings.Replace(u, "/v/", "/embed/", 1) |
234 | if i := strings.Index(u, "&"); i >= 0 { |
235 | u = u[:i] |
236 | } |
237 | fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u) |
238 | } |
239 | case atom.Title: |
240 | default: |
241 | return true |
242 | } |
243 | return false |
244 | }) |
245 | return buf.String() |
246 | } |
247 | |
248 | func childText(node *html.Node) string { |
249 | var buf bytes.Buffer |
250 | for n := node.FirstChild; n != nil; n = n.NextSibling { |
251 | fmt.Fprint(&buf, text(n)) |
252 | } |
253 | return buf.String() |
254 | } |
255 | |
256 | func highlight(node *html.Node, char string) string { |
257 | t := strings.Replace(childText(node), " ", char, -1) |
258 | return fmt.Sprintf("%s%s%s", char, t, char) |
259 | } |
260 | |
261 | type selector func(*html.Node) bool |
262 | |
263 | func isTag(a atom.Atom) selector { |
264 | return func(n *html.Node) bool { |
265 | return n.DataAtom == a |
266 | } |
267 | } |
268 | |
269 | func hasClass(name string) selector { |
270 | return func(n *html.Node) bool { |
271 | for _, a := range n.Attr { |
272 | if a.Key == "class" { |
273 | for _, c := range strings.Fields(a.Val) { |
274 | if c == name { |
275 | return true |
276 | } |
277 | } |
278 | } |
279 | } |
280 | return false |
281 | } |
282 | } |
283 | |
284 | func hasStyle(s Style) selector { |
285 | return func(n *html.Node) bool { |
286 | for rule, s2 := range cssRules { |
287 | if s2 != s { |
288 | continue |
289 | } |
290 | if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) { |
291 | return true |
292 | } |
293 | if n.DataAtom.String() == rule { |
294 | return true |
295 | } |
296 | } |
297 | return false |
298 | } |
299 | } |
300 | |
301 | func attr(node *html.Node, key string) (value string) { |
302 | for _, attr := range node.Attr { |
303 | if attr.Key == key { |
304 | return attr.Val |
305 | } |
306 | } |
307 | return "" |
308 | } |
309 | |
310 | func find(n *html.Node, fn selector) *html.Node { |
311 | var result *html.Node |
312 | walk(n, func(n *html.Node) bool { |
313 | if result != nil { |
314 | return false |
315 | } |
316 | if fn(n) { |
317 | result = n |
318 | return false |
319 | } |
320 | return true |
321 | }) |
322 | return result |
323 | } |
324 | |
325 | func walk(n *html.Node, fn selector) { |
326 | if fn(n) { |
327 | for c := n.FirstChild; c != nil; c = c.NextSibling { |
328 | walk(c, fn) |
329 | } |
330 | } |
331 | } |
332 |
Members