// bg/ResponseMetaData.js

/**
* GNU LibreJS - A browser add-on to block nonfree nontrivial JavaScript.
*
* Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
*
* This file is part of GNU LibreJS.
*
* GNU LibreJS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNU LibreJS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU LibreJS.  If not, see <http://www.gnu.org/licenses/>.
*/

/**
  This class parses HTTP response headers to extract both the
  MIME Content-type and the character set to be used, if specified,
  to parse textual data through a decoder.
*/

// The three bytes of the UTF-8 byte order mark.
const BOM = [
  0xef,
  0xbb,
  0xbf,
];

// Options passed to TextDecoder#decode by ResponseMetaData.decode().
const DECODER_PARAMS = {
  stream: true,
};

/**
 * Holds the Content-Type and Content-Disposition headers of an HTTP
 * response and decodes response bytes to text using the charset they
 * (or, for HTML, the document itself) declare.
 */
class ResponseMetaData {
  /**
   * @param {object} request - Object whose `responseHeaders` property is an
   *   iterable of `{ name, value }` header entries. A missing
   *   `responseHeaders` is treated as "no headers".
   *
   * Sets `this.contentType` / `this.contentDisposition` to the header values
   * (when present), `this.headers` to the matching header entries keyed by
   * the same property names, and `this.computedCharset` to "".
   */
  constructor(request) {
    const responseHeaders = request.responseHeaders || [];
    this.headers = {};
    for (const header of responseHeaders) {
      if (/^\s*Content-(Type|Disposition)\s*$/i.test(header.name)) {
        // "Content-Type" -> "contentType", "content-disposition" -> "contentDisposition"
        const suffix = header.name.split("-")[1].trim();
        const propertyName = `content${suffix.charAt(0).toUpperCase()}${suffix.substring(1).toLowerCase()}`;
        this[propertyName] = header.value;
        this.headers[propertyName] = header;
      }
    }
    this.computedCharset = "";
  }

  /**
   * Charset declared by the Content-Type header, or "" if none.
   *
   * The parameter name is matched case-insensitively and surrounding quotes
   * or a following `;parameter` are not part of the returned label, so
   * `Charset="UTF-8"; boundary=x` yields `UTF-8`.
   *
   * The first access replaces this getter with a read-only own data property
   * holding the result, and also stores the result in `computedCharset`.
   *
   * @returns {string}
   */
  get charset() {
    let charset = "";
    if (this.contentType) {
      const m = this.contentType.match(/;\s*charset\s*=\s*["']?([^\s;"']+)/i);
      if (m) {
        charset = m[1];
      }
    }
    Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true });
    return this.computedCharset = charset;
  }

  /**
   * Decodes response bytes to a string.
   *
   * When the header declares a charset, or the content type does not contain
   * "html", the bytes are decoded with createDecoder()'s default choice.
   * Otherwise the charset is sniffed from the content: a UTF-8 BOM forces
   * UTF-8, else the first <meta> declaring a charset that TextDecoder
   * supports is used. `computedCharset` is updated to the sniffed charset.
   *
   * @param {Uint8Array} data - Bytes to decode. NOTE(review): the BOM check
   *   indexes `data` directly, so a byte-indexable view is assumed — confirm
   *   against callers.
   * @returns {string} The decoded text.
   */
  decode(data) {
    let charset = this.charset;
    let decoder = this.createDecoder();
    const text = decoder.decode(data, DECODER_PARAMS);
    if (charset || !/html/i.test(this.contentType)) {
      return text;
    }

    // missing HTTP charset, sniffing in content...
    if (data[0] === BOM[0] && data[1] === BOM[1] && data[2] === BOM[2]) {
      // A UTF-8 BOM is authoritative; `text` above came from the fallback
      // decoder (no header charset), so decode again as UTF-8.
      this.computedCharset = "utf-8";
      return this.createDecoder("utf-8").decode(data, DECODER_PARAMS);
    }

    // let's try figuring out the charset from <meta> tags
    const doc = new DOMParser().parseFromString(text, "text/html");
    const metas = doc.querySelectorAll('meta[charset], meta[http-equiv="content-type"], meta[content*="charset"]');
    for (const meta of metas) {
      charset = meta.getAttribute("charset");
      if (!charset) {
        // meta[http-equiv] may lack a content attribute (getAttribute -> null)
        const content = meta.getAttribute("content") || "";
        const match = content.match(/;\s*charset\s*=\s*([\w-]+)/i);
        if (match) charset = match[1];
      }
      if (charset) {
        // null default: skip labels TextDecoder rejects and try the next <meta>
        decoder = this.createDecoder(charset, null);
        if (decoder) {
          this.computedCharset = charset;
          return decoder.decode(data, DECODER_PARAMS);
        }
      }
    }
    return text;
  }

  /**
   * Creates a TextDecoder for `charset`.
   *
   * @param {string} [charset=this.charset] - Encoding label to try first.
   * @param {?string} [def="latin1"] - Encoding used when `charset` is empty
   *   or rejected by TextDecoder; pass a falsy value to get `null` instead.
   * @returns {?TextDecoder} A decoder, or null when `charset` is unusable
   *   and `def` is falsy. A rejected label is logged via console.error.
   */
  createDecoder(charset = this.charset, def = "latin1") {
    if (charset) {
      try {
        return new TextDecoder(charset);
      } catch (e) {
        console.error(e);
      }
    }
    return def ? new TextDecoder(def) : null;
  }
}
// Typed-array copy of the BOM bytes, exposed as a static property of the class.
ResponseMetaData.UTF8BOM = Uint8Array.from(BOM);

// CommonJS export: the module exposes the ResponseMetaData class by name.
module.exports = { ResponseMetaData };