diff options
author | Ruben Rodriguez <ruben@gnu.org> | 2019-03-13 21:51:01 +0000 |
---|---|---|
committer | Ruben Rodriguez <ruben@gnu.org> | 2019-03-13 21:51:01 +0000 |
commit | 55469c349c3c47c882ee21348ba67780c8291003 (patch) | |
tree | a7aa8fa171eaa79747a2fe231006d9540458cb31 /bg | |
parent | 1e4ce528dec55b79501e15ef08b751b4f533d756 (diff) | |
parent | 50d4b4ba0207b7fd7ece2f1797ff87795e5bc064 (diff) |
Merge #36 `Correctly decode using the original document charset and force re-encoding via UTF-8 BOM only when needed.`
Diffstat (limited to 'bg')
-rw-r--r-- | bg/ResponseMetaData.js | 57 | ||||
-rw-r--r-- | bg/ResponseProcessor.js | 24 |
2 files changed, 51 insertions, 30 deletions
diff --git a/bg/ResponseMetaData.js b/bg/ResponseMetaData.js index 41d1fe9..0f768fe 100644 --- a/bg/ResponseMetaData.js +++ b/bg/ResponseMetaData.js @@ -25,6 +25,9 @@ to parse textual data through a decoder. */ +const BOM = [0xEF, 0xBB, 0xBF]; +const DECODER_PARAMS = {stream: true}; + class ResponseMetaData { constructor(request) { let {responseHeaders} = request; @@ -37,7 +40,7 @@ class ResponseMetaData { this.headers[propertyName] = h; } } - this.forcedUTF8 = false; + this.computedCharset = ""; } get charset() { @@ -49,34 +52,54 @@ class ResponseMetaData { } } Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true }); - return charset; + return this.computedCharset = charset; } - get isUTF8() { - return /^utf-?8$/i.test(this.charset); - } + decode(data) { + let charset = this.charset; + let decoder = this.createDecoder(); + let text = decoder.decode(data, DECODER_PARAMS); + if (!charset && /html/i.test(this.contentType)) { + // missing HTTP charset, sniffing in content... - forceUTF8() { - if (!(this.forcedUTF8 || this.isUTF8)) { - let h = this.headers.contentType; - if (h) { - h.value = h.value.replace(/;\s*charset\s*=.*|$/, "; charset=utf8"); - this.forcedUTF8 = true; - } // if the header doesn't exist the browser should default to UTF-8 anyway + if (data[0] === BOM[0] && data[1] === BOM[1] && data[2] === BOM[2]) { + // forced UTF-8, nothing to do + return text; + } + + // let's try figuring out the charset from <meta> tags + let parser = new DOMParser(); + let doc = parser.parseFromString(text, "text/html"); + let meta = doc.querySelectorAll('meta[charset], meta[http-equiv="content-type"], meta[content*="charset"]'); + for (let m of meta) { + charset = m.getAttribute("charset"); + if (!charset) { + let match = m.getAttribute("content").match(/;\s*charset\s*=\s*([\w-]+)/i) + if (match) charset = match[1]; + } + if (charset) { + decoder = this.createDecoder(charset, null); + if (decoder) { + this.computedCharset = charset; + return decoder.decode(data, DECODER_PARAMS); + } + } + } } - return this.forcedUTF8; + return text; } - createDecoder() { - if (this.charset) { + createDecoder(charset = this.charset, def = "latin1") { + if (charset) { try { - return new TextDecoder(this.charset); + return new TextDecoder(charset); } catch (e) { console.error(e); } } - return new TextDecoder("utf-8"); + return def ? new TextDecoder(def) : null; } }; +ResponseMetaData.UTF8BOM = new Uint8Array(BOM); module.exports = { ResponseMetaData }; diff --git a/bg/ResponseProcessor.js b/bg/ResponseProcessor.js index 4443d90..1aa89de 100644 --- a/bg/ResponseProcessor.js +++ b/bg/ResponseProcessor.js @@ -90,8 +90,6 @@ class ResponseTextFilter { }; filter.onstop = async event => { - - let params = {stream: true}; // concatenate chunks let size = buffer.reduce((sum, chunk, n) => sum + chunk.byteLength, 0) let allBytes = new Uint8Array(size); @@ -108,10 +106,10 @@ class ResponseTextFilter { response.text = await (await fetch(request.url, {cache: "reload", credentials: "include"})).text(); } else { console.debug("It's a %s, trying to decode it as UTF-16.", request.type); - response.text = new TextDecoder("utf-16be").decode(allBytes); + response.text = new TextDecoder("utf-16be").decode(allBytes, {stream: true}); } } else { - response.text = metaData.createDecoder().decode(allBytes, {stream: true}); + response.text = metaData.decode(allBytes); } let editedText = null; try { @@ -119,19 +117,19 @@ class ResponseTextFilter { } catch(e) { console.error(e); } - if (editedText !== null && - (metaData.forcedUTF8 && request.type !== "script" || - response.text !== editedText)) { - // if we changed the charset, the text or both, let's re-encode - filter.write(new TextEncoder().encode(editedText)); - } else { - // ... otherwise pass all the raw bytes through - filter.write(allBytes); + if (editedText !== null) { + // we changed the content, let's re-encode + let encoded = new TextEncoder().encode(editedText); + // pre-pending the UTF-8 BOM will force the charset per HTML 5 specs + allBytes = new Uint8Array(encoded.byteLength + 3); + allBytes.set(ResponseMetaData.UTF8BOM, 0); // UTF-8 BOM + allBytes.set(encoded, 3); } + filter.write(allBytes); filter.close(); } - return metaData.forceUTF8() ? {responseHeaders} : ResponseProcessor.ACCEPT;; + return ResponseProcessor.ACCEPT; } } |