about | summary | refs | log | tree | commit | diff
path: root/bg
diff options
context:
space:
mode:
author Ruben Rodriguez <ruben@gnu.org> 2019-03-13 21:51:01 +0000
committer Ruben Rodriguez <ruben@gnu.org> 2019-03-13 21:51:01 +0000
commit 55469c349c3c47c882ee21348ba67780c8291003 (patch)
tree a7aa8fa171eaa79747a2fe231006d9540458cb31 /bg
parent 1e4ce528dec55b79501e15ef08b751b4f533d756 (diff)
parent 50d4b4ba0207b7fd7ece2f1797ff87795e5bc064 (diff)
Merge #36 `Correctly decode using the original document charset and force re-encoding via UTF-8 BOM only when needed.`
Diffstat (limited to 'bg')
-rw-r--r-- bg/ResponseMetaData.js 57
-rw-r--r-- bg/ResponseProcessor.js 24
2 files changed, 51 insertions, 30 deletions
diff --git a/bg/ResponseMetaData.js b/bg/ResponseMetaData.js
index 41d1fe9..0f768fe 100644
--- a/bg/ResponseMetaData.js
+++ b/bg/ResponseMetaData.js
@@ -25,6 +25,9 @@
to parse textual data through a decoder.
*/
+const BOM = [0xEF, 0xBB, 0xBF];
+const DECODER_PARAMS = {stream: true};
+
class ResponseMetaData {
constructor(request) {
let {responseHeaders} = request;
@@ -37,7 +40,7 @@ class ResponseMetaData {
this.headers[propertyName] = h;
}
}
- this.forcedUTF8 = false;
+ this.computedCharset = "";
}
get charset() {
@@ -49,34 +52,54 @@ class ResponseMetaData {
}
}
Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true });
- return charset;
+ return this.computedCharset = charset;
}
- get isUTF8() {
- return /^utf-?8$/i.test(this.charset);
- }
+ decode(data) {
+ let charset = this.charset;
+ let decoder = this.createDecoder();
+ let text = decoder.decode(data, DECODER_PARAMS);
+ if (!charset && /html/i.test(this.contentType)) {
+ // missing HTTP charset, sniffing in content...
- forceUTF8() {
- if (!(this.forcedUTF8 || this.isUTF8)) {
- let h = this.headers.contentType;
- if (h) {
- h.value = h.value.replace(/;\s*charset\s*=.*|$/, "; charset=utf8");
- this.forcedUTF8 = true;
- } // if the header doesn't exist the browser should default to UTF-8 anyway
+ if (data[0] === BOM[0] && data[1] === BOM[1] && data[2] === BOM[2]) {
+ // forced UTF-8, nothing to do
+ return text;
+ }
+
+ // let's try figuring out the charset from <meta> tags
+ let parser = new DOMParser();
+ let doc = parser.parseFromString(text, "text/html");
+ let meta = doc.querySelectorAll('meta[charset], meta[http-equiv="content-type"], meta[content*="charset"]');
+ for (let m of meta) {
+ charset = m.getAttribute("charset");
+ if (!charset) {
+ let match = m.getAttribute("content").match(/;\s*charset\s*=\s*([\w-]+)/i)
+ if (match) charset = match[1];
+ }
+ if (charset) {
+ decoder = this.createDecoder(charset, null);
+ if (decoder) {
+ this.computedCharset = charset;
+ return decoder.decode(data, DECODER_PARAMS);
+ }
+ }
+ }
}
- return this.forcedUTF8;
+ return text;
}
- createDecoder() {
- if (this.charset) {
+ createDecoder(charset = this.charset, def = "latin1") {
+ if (charset) {
try {
- return new TextDecoder(this.charset);
+ return new TextDecoder(charset);
} catch (e) {
console.error(e);
}
}
- return new TextDecoder("utf-8");
+ return def ? new TextDecoder(def) : null;
}
};
+ResponseMetaData.UTF8BOM = new Uint8Array(BOM);
module.exports = { ResponseMetaData };
diff --git a/bg/ResponseProcessor.js b/bg/ResponseProcessor.js
index 4443d90..1aa89de 100644
--- a/bg/ResponseProcessor.js
+++ b/bg/ResponseProcessor.js
@@ -90,8 +90,6 @@ class ResponseTextFilter {
};
filter.onstop = async event => {
-
- let params = {stream: true};
// concatenate chunks
let size = buffer.reduce((sum, chunk, n) => sum + chunk.byteLength, 0)
let allBytes = new Uint8Array(size);
@@ -108,10 +106,10 @@ class ResponseTextFilter {
response.text = await (await fetch(request.url, {cache: "reload", credentials: "include"})).text();
} else {
console.debug("It's a %s, trying to decode it as UTF-16.", request.type);
- response.text = new TextDecoder("utf-16be").decode(allBytes);
+ response.text = new TextDecoder("utf-16be").decode(allBytes, {stream: true});
}
} else {
- response.text = metaData.createDecoder().decode(allBytes, {stream: true});
+ response.text = metaData.decode(allBytes);
}
let editedText = null;
try {
@@ -119,19 +117,19 @@ class ResponseTextFilter {
} catch(e) {
console.error(e);
}
- if (editedText !== null &&
- (metaData.forcedUTF8 && request.type !== "script" ||
- response.text !== editedText)) {
- // if we changed the charset, the text or both, let's re-encode
- filter.write(new TextEncoder().encode(editedText));
- } else {
- // ... otherwise pass all the raw bytes through
- filter.write(allBytes);
+ if (editedText !== null) {
+ // we changed the content, let's re-encode
+ let encoded = new TextEncoder().encode(editedText);
+ // pre-pending the UTF-8 BOM will force the charset per HTML 5 specs
+ allBytes = new Uint8Array(encoded.byteLength + 3);
+ allBytes.set(ResponseMetaData.UTF8BOM, 0); // UTF-8 BOM
+ allBytes.set(encoded, 3);
}
+ filter.write(allBytes);
filter.close();
}
- return metaData.forceUTF8() ? {responseHeaders} : ResponseProcessor.ACCEPT;;
+ return ResponseProcessor.ACCEPT;
}
}