summaryrefslogtreecommitdiff
path: root/purify_html
blob: 9c3a7862a82745f6d4de66ffc41db5812be8170a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/bin/sh

# Mangle the rendered files to cause fewer differences after re-rendering.

# Written by Thomas Schwinge <thomas@schwinge.name>.

# Un-mangle mailto links: convert HTML character entities to real characters.
find ./ -name \*.html -print0 \
  | xargs -0 --no-run-if-empty -n 1 \
    perl -e \
      'BEGIN {
        $file = $ARGV[0];
        $discard = 1;
        $replacing = 0;

        # TODO: could use a proper temporary file.
        open(OUT, ">$file.new") or die "open: $file: $!";
        select(OUT) or die "select: $file: $!";
      }

      while (<>) {
        # The replacing-toggling logic is a bit rough, but so is life.
        $replacing = 1 if /<a href="mailto:/;
        s%\&#(x?)([^;]*);%$discard = 0; chr(length($1) ? hex($2) : $2);%eg if $replacing;
        $replacing = 0 if /<\/a>/;
      } continue {
        print or die "print: $file: $!";
      }

      END {
        if ($discard) {
          unlink("$file.new") or die "unlink: $file: $!";
        } else {
          rename("$file.new", $file) or die "rename: $file: $!";
        }
      }'

# Compared to using ``perl -p -i -l'', this solution maintains the files'
# original timestamps unless they're actually modified.