aboutsummaryrefslogtreecommitdiff
path: root/bg/ResponseMetaData.js
diff options
context:
space:
mode:
authorhackademix <giorgio@maone.net>2019-03-13 01:06:18 +0100
committerhackademix <giorgio@maone.net>2019-03-13 01:06:18 +0100
commit50d4b4ba0207b7fd7ece2f1797ff87795e5bc064 (patch)
tree8577566cd578b0faaf07c0e1007666f6441f2e6a /bg/ResponseMetaData.js
parent2a272e97c076c8a0f13a1f0f5140ad5d72cbd890 (diff)
Sniffing of charset in <meta> tags and BOM.
Diffstat (limited to 'bg/ResponseMetaData.js')
-rw-r--r--bg/ResponseMetaData.js49
1 files changed, 44 insertions, 5 deletions
diff --git a/bg/ResponseMetaData.js b/bg/ResponseMetaData.js
index 5a75a3c..0f768fe 100644
--- a/bg/ResponseMetaData.js
+++ b/bg/ResponseMetaData.js
@@ -25,6 +25,9 @@
to parse textual data through a decoder.
*/
+const BOM = [0xEF, 0xBB, 0xBF];
+const DECODER_PARAMS = {stream: true};
+
class ResponseMetaData {
constructor(request) {
let {responseHeaders} = request;
@@ -37,6 +40,7 @@ class ResponseMetaData {
this.headers[propertyName] = h;
}
}
+ this.computedCharset = "";
}
get charset() {
@@ -48,19 +52,54 @@ class ResponseMetaData {
}
}
Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true });
- return charset;
+ return this.computedCharset = charset;
+ }
+
+ decode(data) {
+ let charset = this.charset;
+ let decoder = this.createDecoder();
+ let text = decoder.decode(data, DECODER_PARAMS);
+ if (!charset && /html/i.test(this.contentType)) {
+ // missing HTTP charset, sniffing in content...
+
+ if (data[0] === BOM[0] && data[1] === BOM[1] && data[2] === BOM[2]) {
+ // forced UTF-8, nothing to do
+ return text;
+ }
+
+ // let's try figuring out the charset from <meta> tags
+ let parser = new DOMParser();
+ let doc = parser.parseFromString(text, "text/html");
+ let meta = doc.querySelectorAll('meta[charset], meta[http-equiv="content-type"], meta[content*="charset"]');
+ for (let m of meta) {
+ charset = m.getAttribute("charset");
+ if (!charset) {
+ let match = m.getAttribute("content").match(/;\s*charset\s*=\s*([\w-]+)/i)
+ if (match) charset = match[1];
+ }
+ if (charset) {
+ decoder = this.createDecoder(charset, null);
+ if (decoder) {
+ this.computedCharset = charset;
+ return decoder.decode(data, DECODER_PARAMS);
+ }
+ }
+ }
+ }
+ return text;
}
- createDecoder() {
- if (this.charset) {
+ createDecoder(charset = this.charset, def = "latin1") {
+ if (charset) {
try {
- return new TextDecoder(this.charset);
+ return new TextDecoder(charset);
} catch (e) {
console.error(e);
}
}
- return new TextDecoder("latin1");
+ return def ? new TextDecoder(def) : null;
}
};
+ResponseMetaData.UTF8BOM = new Uint8Array(BOM);
module.exports = { ResponseMetaData };