diff options
| -rw-r--r-- | bg/ResponseMetaData.js | 57 | ||||
| -rw-r--r-- | bg/ResponseProcessor.js | 24 | 
2 files changed, 51 insertions, 30 deletions
| diff --git a/bg/ResponseMetaData.js b/bg/ResponseMetaData.js index 41d1fe9..0f768fe 100644 --- a/bg/ResponseMetaData.js +++ b/bg/ResponseMetaData.js @@ -25,6 +25,9 @@    to parse textual data through a decoder.  */ +const BOM = [0xEF, 0xBB, 0xBF]; +const DECODER_PARAMS = {stream: true}; +  class ResponseMetaData {    constructor(request) {      let {responseHeaders} = request; @@ -37,7 +40,7 @@ class ResponseMetaData {          this.headers[propertyName] = h;        }      } -    this.forcedUTF8 = false; +    this.computedCharset = "";    }    get charset() { @@ -49,34 +52,54 @@ class ResponseMetaData {        }      }      Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true }); -    return charset; +    return this.computedCharset = charset;    } -  get isUTF8() { -    return /^utf-?8$/i.test(this.charset); -  } +  decode(data) { +    let charset = this.charset; +    let decoder = this.createDecoder(); +    let text = decoder.decode(data, DECODER_PARAMS); +    if (!charset && /html/i.test(this.contentType)) { +      // missing HTTP charset, sniffing in content... -  forceUTF8() { -    if (!(this.forcedUTF8 || this.isUTF8)) { -      let h = this.headers.contentType; -      if (h) { -        h.value = h.value.replace(/;\s*charset\s*=.*|$/, "; charset=utf8"); -        this.forcedUTF8 = true; -      } // if the header doesn't exist the browser should default to UTF-8 anyway +      if (data[0] === BOM[0] && data[1] === BOM[1] && data[2] === BOM[2]) { +        // forced UTF-8, nothing to do +        return text; +      } + +      // let's try figuring out the charset from <meta> tags +      let parser = new DOMParser(); +      let doc = parser.parseFromString(text, "text/html"); +      let meta = doc.querySelectorAll('meta[charset], meta[http-equiv="content-type"], meta[content*="charset"]'); +      for (let m of meta) { +        charset = m.getAttribute("charset"); +        if (!charset) { +          let match = m.getAttribute("content").match(/;\s*charset\s*=\s*([\w-]+)/i) +          if (match) charset = match[1]; +        } +        if (charset) { +          decoder = this.createDecoder(charset, null); +          if (decoder) { +            this.computedCharset = charset; +            return decoder.decode(data, DECODER_PARAMS); +          } +        } +      }      } -    return this.forcedUTF8; +    return text;    } -  createDecoder() { -    if (this.charset) { +  createDecoder(charset = this.charset, def = "latin1") { +    if (charset) {        try { -        return new TextDecoder(this.charset); +        return new TextDecoder(charset);        } catch (e) {          console.error(e);        }      } -    return new TextDecoder("utf-8"); +    return def ? new TextDecoder(def) : null;    }  }; +ResponseMetaData.UTF8BOM = new Uint8Array(BOM);  module.exports = { ResponseMetaData }; diff --git a/bg/ResponseProcessor.js b/bg/ResponseProcessor.js index 4443d90..1aa89de 100644 --- a/bg/ResponseProcessor.js +++ b/bg/ResponseProcessor.js @@ -90,8 +90,6 @@ class ResponseTextFilter {      };      filter.onstop = async event => { - -      let params = {stream: true};        // concatenate chunks        let size = buffer.reduce((sum, chunk, n) => sum + chunk.byteLength, 0)        let allBytes = new Uint8Array(size); @@ -108,10 +106,10 @@ class ResponseTextFilter {            response.text = await (await fetch(request.url, {cache: "reload", credentials: "include"})).text();          } else {            console.debug("It's a %s, trying to decode it as UTF-16.", request.type); -          response.text = new TextDecoder("utf-16be").decode(allBytes); +          response.text = new TextDecoder("utf-16be").decode(allBytes, {stream: true});          }        } else { -        response.text = metaData.createDecoder().decode(allBytes, {stream: true}); +        response.text = metaData.decode(allBytes);        }        let editedText = null;        try { @@ -119,19 +117,19 @@ class ResponseTextFilter {        } catch(e) {          console.error(e);        } -      if (editedText !== null && -        (metaData.forcedUTF8 && request.type !== "script" || -          response.text !== editedText)) { -        // if we changed the charset, the text or both, let's re-encode -        filter.write(new TextEncoder().encode(editedText)); -      } else { -        // ... otherwise pass all the raw bytes through -        filter.write(allBytes); +      if (editedText !== null) { +        // we changed the content, let's re-encode +        let encoded = new TextEncoder().encode(editedText); +        // pre-pending the UTF-8 BOM will force the charset per HTML 5 specs +        allBytes = new Uint8Array(encoded.byteLength + 3); +        allBytes.set(ResponseMetaData.UTF8BOM, 0); // UTF-8 BOM +        allBytes.set(encoded, 3);        } +      filter.write(allBytes);        filter.close();      } -    return metaData.forceUTF8() ? {responseHeaders} : ResponseProcessor.ACCEPT;; +    return ResponseProcessor.ACCEPT;    }  } | 
