aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorhackademix <giorgio@maone.net>2018-07-15 23:15:34 +0200
committerhackademix <giorgio@maone.net>2018-07-17 17:45:27 +0200
commitdbe1ddbc1ddd9b40c6e0e5c896768f91cb6994e0 (patch)
treec3eb5a2b9bdbf59901d9da2602a0635ea3cc00c7
parenteb929a0aff3a23a53f750ab3c866fbec57107fbb (diff)
Refactor and fix HTTP response filtering to touch only scripts, either external or embedded inside HTML documents and sub-documents.
-rw-r--r--bg/ResponseMetaData.js82
-rw-r--r--bg/ResponseProcessor.js110
-rw-r--r--main_background.js142
3 files changed, 234 insertions, 100 deletions
diff --git a/bg/ResponseMetaData.js b/bg/ResponseMetaData.js
new file mode 100644
index 0000000..40ca3f3
--- /dev/null
+++ b/bg/ResponseMetaData.js
@@ -0,0 +1,82 @@
+/**
+* GNU LibreJS - A browser add-on to block nonfree nontrivial JavaScript.
+*
+* Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
+*
+* This file is part of GNU LibreJS.
+*
+* GNU LibreJS is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* GNU LibreJS is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with GNU LibreJS. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ This class parses HTTP response headers to extract both the
+ MIME Content-type and the character set to be used, if specified,
+ to parse textual data through a decoder.
+*/
+
+class ResponseMetaData {
+  /**
+   * Scans the response headers for Content-Type and Content-Disposition.
+   * Each one found is exposed twice: its value as `this.contentType` /
+   * `this.contentDisposition`, and the header object itself (by reference,
+   * so it can be rewritten in place) under the same key of `this.headers`.
+   *
+   * @param {object} request - webRequest details; `responseHeaders` is
+   *   expected to be an array of `{name, value}` objects. A missing array
+   *   is treated as "no headers".
+   */
+  constructor(request) {
+    let responseHeaders = request.responseHeaders || [];
+    this.headers = {};
+    for (let h of responseHeaders) {
+      if (/^\s*Content-(Type|Disposition)\s*$/i.test(h.name)) {
+        let propertyName = h.name.split("-")[1].trim();
+        propertyName = `content${propertyName.charAt(0).toUpperCase()}${propertyName.substring(1).toLowerCase()}`;
+        this[propertyName] = h.value;
+        this.headers[propertyName] = h;
+      }
+    }
+    this.forcedUTF8 = false;
+  }
+
+  /**
+   * The charset declared in Content-Type, or "" if none is declared.
+   * Computed on first access, then cached by shadowing this getter with an
+   * own data property: later header rewrites do not change the answer.
+   *
+   * @returns {string}
+   */
+  get charset() {
+    let charset = "";
+    if (typeof this.contentType === "string") {
+      // Parameter names are case-insensitive and the value may be quoted or
+      // followed by further parameters: capture the bare label only.
+      let m = this.contentType.match(/;\s*charset\s*=\s*["']?([^\s;"']+)/i);
+      if (m) {
+        charset = m[1];
+      }
+    }
+    Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true });
+    return charset;
+  }
+
+  /**
+   * Whether the declared charset is UTF-8. Both the "utf-8" and the "utf8"
+   * labels are recognized, the latter being what forceUTF8() writes.
+   *
+   * @returns {boolean}
+   */
+  get isUTF8() {
+    return /^utf-?8$/i.test(this.charset);
+  }
+
+  /**
+   * Rewrites the Content-Type header in place so that it declares UTF-8,
+   * unless it already does or it has already been rewritten.
+   *
+   * @returns {boolean} true if the header has been rewritten (by this or by
+   *   a previous call), false otherwise.
+   */
+  forceUTF8() {
+    if (!(this.forcedUTF8 || this.isUTF8)) {
+      let h = this.headers.contentType;
+      if (h && typeof h.value === "string") {
+        // Case-insensitive, otherwise "Charset=..." would be left in place
+        // and a second, conflicting charset parameter appended after it.
+        h.value = h.value.replace(/;\s*charset\s*=.*|$/i, "; charset=utf8");
+        this.forcedUTF8 = true;
+      } // if the header doesn't exist the browser should default to UTF-8 anyway
+    }
+    return this.forcedUTF8;
+  }
+
+  /**
+   * Creates a decoder for the response body, honoring the declared charset.
+   *
+   * @returns {TextDecoder} a decoder for the declared charset, or a UTF-8
+   *   one if none is declared or the declared label is not supported.
+   */
+  createDecoder() {
+    if (this.charset) {
+      try {
+        return new TextDecoder(this.charset);
+      } catch (e) {
+        // unsupported label: log it and fall back to UTF-8
+        console.error(e);
+      }
+    }
+    return new TextDecoder("utf-8");
+  }
+};
+
+module.exports = { ResponseMetaData };
diff --git a/bg/ResponseProcessor.js b/bg/ResponseProcessor.js
new file mode 100644
index 0000000..3f3151b
--- /dev/null
+++ b/bg/ResponseProcessor.js
@@ -0,0 +1,110 @@
+/**
+* GNU LibreJS - A browser add-on to block nonfree nontrivial JavaScript.
+*
+* Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
+*
+* This file is part of GNU LibreJS.
+*
+* GNU LibreJS is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* GNU LibreJS is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with GNU LibreJS. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ An abstraction layer over the StreamFilter API, allowing its clients to process
+ only the "interesting" HTML and script requests and leaving the others alone
+*/
+
+let {ResponseMetaData} = require("./ResponseMetaData.js");
+
+// Maps each installed handler to the webRequest listener wrapping it, so that
+// uninstall() can look up the listener it has to remove.
+let listeners = new WeakMap();
+// The webRequest event the listeners are attached to and detached from.
+let webRequestEvent = browser.webRequest.onHeadersReceived;
+
+class ResponseProcessor {
+
+  /**
+   * Registers a blocking onHeadersReceived listener which runs every
+   * matching response through a ResponseTextFilter driven by `handler`.
+   *
+   * @param {function} handler - called with the decoded response; see
+   *   ResponseTextFilter.process() for the object it receives.
+   * @param {string[]} types - webRequest resource types to listen for.
+   * @returns {boolean} false if `handler` is already installed (nothing is
+   *   registered in that case), true otherwise.
+   */
+  static install(handler, types = ["main_frame", "sub_frame", "script"]) {
+    if (listeners.has(handler)) return false;
+    let listener =
+      request => new ResponseTextFilter(request).process(handler);
+    listeners.set(handler, listener);
+    webRequestEvent.addListener(
+      listener,
+      {urls: ["<all_urls>"], types},
+      ["blocking", "responseHeaders"]
+    );
+    return true;
+  }
+
+  /**
+   * Removes the listener registered for `handler`, if any.
+   *
+   * @param {function} handler - a handler previously passed to install().
+   */
+  static uninstall(handler) {
+    let listener = listeners.get(handler);
+    if (listener) {
+      webRequestEvent.removeListener(listener);
+      // Forget the handler too, otherwise a later install() of the same
+      // handler would return false without registering anything.
+      listeners.delete(handler);
+    }
+  }
+}
+
+/**
+ Buffers the body of one response, hands its decoded text to a handler and
+ writes back either the handler's edited text or the original bytes.
+*/
+class ResponseTextFilter {
+  /**
+   * @param {object} request - the details object of an onHeadersReceived
+   *   event (`requestId`, `type`, `statusCode`, `responseHeaders` are read).
+   */
+  constructor(request) {
+    this.request = request;
+    let {type, statusCode} = request;
+    let md = this.metaData = new ResponseMetaData(request);
+    // ResponseMetaData exposes the header as `contentDisposition`. Anything
+    // but an explicit "inline" disposition is handled as a download.
+    let disposition = md.contentDisposition;
+    let forcedDownload = !!disposition && !/^\s*inline\b/i.test(disposition);
+    this.canProcess = // we want to process html documents and scripts only
+      (statusCode < 300 || statusCode >= 400) && // skip redirections
+      !forcedDownload && // skip forced downloads
+      (type === "script" || /\bhtml\b/i.test(md.contentType));
+  }
+
+  /**
+   * Attaches a stream filter to the response and lets `handler` edit it.
+   *
+   * @param {function} handler - called, once the whole body is received,
+   *   with `{request, metaData, text}`; may be async. It returns the edited
+   *   text. A null/undefined result, or a thrown error, means "no edit".
+   * @returns {object} a blocking response: `{responseHeaders}` if the
+   *   Content-Type header had to be rewritten to UTF-8, `{}` otherwise.
+   */
+  process(handler) {
+    if (!this.canProcess) return {};
+    let metaData = this.metaData;
+    let {requestId, responseHeaders} = this.request;
+    let filter = browser.webRequest.filterResponseData(requestId);
+    let buffer = [];
+
+    filter.ondata = event => {
+      buffer.push(event.data);
+    };
+
+    filter.onstop = async event => {
+      try {
+        let decoder = metaData.createDecoder();
+        let params = {stream: true};
+        let text = this.text = buffer.map(
+          chunk => decoder.decode(chunk, params))
+          .join('') + decoder.decode(); // flush what the stream still holds
+        let editedText = null;
+        try {
+          let response = {
+            request: this.request,
+            metaData,
+            text,
+          };
+          editedText = await handler(response);
+        } catch(e) {
+          console.error(e);
+        }
+        // No usable result from the handler: fall back to the original
+        // text, rather than encoding null/undefined into the page.
+        let output =
+          (editedText === null || editedText === undefined) ? text : editedText;
+        if (metaData.forcedUTF8 || output !== text) {
+          // if we changed the charset, the text or both, let's re-encode
+          filter.write(new TextEncoder().encode(output));
+        } else {
+          // ... otherwise pass all the raw bytes through
+          for (let chunk of buffer) filter.write(chunk);
+        }
+      } finally {
+        // always release the response, even if something above failed
+        filter.disconnect();
+      }
+    }
+
+    return metaData.forceUTF8() ? {responseHeaders} : {};
+  }
+}
+
+module.exports = { ResponseProcessor };
diff --git a/main_background.js b/main_background.js
index debe0c2..ff68479 100644
--- a/main_background.js
+++ b/main_background.js
@@ -25,6 +25,7 @@ var acorn = require('acorn/dist/acorn_loose');
var jssha = require('jssha');
var walk = require("acorn/dist/walk");
var legacy_license_lib = require("./legacy_license_check.js");
+var {ResponseProcessor} = require("./bg/ResponseProcessor");
console.log("main_background.js");
/**
@@ -853,7 +854,7 @@ function license_valid(matches){
* reason text
* ]
*/
-function license_read(script_src, name){
+function license_read(script_src, name, external = false){
var reason_text = "";
@@ -970,7 +971,7 @@ function get_script(response,url,tabid,wl,index=-1){
}
edited = [true,response,"Page is whitelisted in preferences"];
}else{
- edited = license_read(response,scriptname);
+ edited = license_read(response,scriptname,index == -2);
}
var src_hash = hash(response);
var verdict = edited[0];
@@ -1066,35 +1067,28 @@ function block_ga(a){
else return {};
}
+
+
/**
-* This is a callback trigged from requests caused by script tags with the src="" attribute.
+* This listener gets called as soon as we've got all the HTTP headers, can guess
+* content type and encoding, and therefore correctly parse HTML documents and
+* external script inclusions in search of non-free JavaScript
*/
-function read_script(a){
- var GA = test_GA(a);
- if(GA !== false){
- return GA;
- }
- var filter = webex.webRequest.filterResponseData(a.requestId);
- var decoder = new TextDecoder("utf-8");
- var encoder = new TextEncoder();
- var str = "";
-
- filter.onstop = event => {
- dbg_print("read_script "+a.url);
- var res = test_url_whitelisted(a.url);
- res.then(function(whitelisted){
- var edit_script = get_script(str,a.url,a["tabId"],whitelisted,-1);
- edit_script.then(function(edited){
- filter.write(encoder.encode(edited));
- filter.disconnect();
- });
- });
- }
- filter.ondata = event => {
- str += decoder.decode(event.data, {stream: true});
- }
- return {};
+async function responseHandler(response) {
+  // Read what we need up front, then resolve the whitelist status once and
+  // hand the response over to the handler matching its resource type.
+  const {request: {url, type}} = response;
+  const whitelisted = await test_url_whitelisted(url);
+  if (type === "script") {
+    return await handle_script(response, whitelisted);
+  }
+  return await handle_html(response, whitelisted);
+}
+
+/**
+* Here we handle external script requests
+*/
+async function handle_script(response, whitelisted){
+  const {text: source, request: {url, tabId}} = response;
+  // index -2 is what get_script() turns into license_read()'s `external` flag
+  const edited = await get_script(source, url, tabId, whitelisted, -2);
+  return edited;
 }
/**
@@ -1260,61 +1254,21 @@ function edit_html(html,url,tabid,wl){
}
/**
-* Callback for main frame requests
-*
+* Here we handle html document responses
*/
-function read_document(a){
- var GA = test_GA(a);
- if(GA != false){
- return GA;
- }
- var str = "";
- var filter = webex.webRequest.filterResponseData(a.requestId);
- var decoder = new TextDecoder("utf-8");
- var encoder = new TextEncoder();
- filter.onerror = event => {
- dbg_print("%c Error in getting document","color:red");
- }
- filter.onstop = event => {
- time = Date.now();
- delete unused_data[a["tabId"]];
- webex.browserAction.setBadgeText({
- text: "✓",
- tabId: a["tabId"]
- });
- webex.browserAction.setBadgeBackgroundColor({
- color: "green",
- tabId: a["tabId"]
- });
- var test = new ArrayBuffer();
- var res = test_url_whitelisted(a.url);
- res.then(function(whitelisted){
- var edit_page;
- // TODO Fix this ugly HACK!
- if(! str.includes("<html")){
- dbg_print("not html");
- filter.write(encoder.encode(str));
- filter.disconnect();
- return {};
- }
- if(whitelisted == true){
- dbg_print("WHITELISTED");
- // Doesn't matter if this is accepted or blocked, it will still be whitelisted
- filter.write(encoder.encode(str));
- filter.disconnect();
- } else{
- edit_page = edit_html(str,a.url,a["tabId"],false);
- edit_page.then(function(edited){
- filter.write(encoder.encode(edited));
- filter.disconnect();
- });
- }
- });
- }
- filter.ondata = event => {
- str += decoder.decode(event.data, {stream: true});
- }
- return {};
+async function handle_html(response, whitelisted) {
+  let {text, request} = response;
+  let {url, tabId, type} = request;
+  // This handler runs for sub-frames as well: only a top-level document
+  // load may reset the per-tab data and the badge.
+  if (type === "main_frame") {
+    delete unused_data[tabId];
+    browser.browserAction.setBadgeText({
+      text: "✓",
+      tabId
+    });
+    browser.browserAction.setBadgeBackgroundColor({
+      color: "green",
+      tabId
+    });
+  }
+  if (whitelisted) {
+    // whitelisted pages go through untouched
+    return text;
+  }
+  return await edit_html(text, url, tabId, false);
 }
/**
@@ -1329,32 +1283,20 @@ function init_addon(){
webex.tabs.onRemoved.addListener(delete_removed_tab_info);
// Prevents Google Analytics from being loaded from Google servers
- var all_types = [
+ let all_types = [
"beacon", "csp_report", "font", "image", "imageset", "main_frame", "media",
"object", "object_subrequest", "ping", "script", "stylesheet", "sub_frame",
"web_manifest", "websocket", "xbl", "xml_dtd", "xmlhttprequest", "xslt",
"other"
- ]
- // Analyzes remote scripts
+ ];
webex.webRequest.onBeforeRequest.addListener(
block_ga,
- {urls:["<all_urls>"], types:all_types},
- ["blocking"]
- );
-
- // Analyzes remote scripts
- webex.webRequest.onBeforeRequest.addListener(
- read_script,
- {urls:["<all_urls>"], types:["script"]},
- ["blocking"]
- );
-
- // Analyzes the scripts inside of HTML
- webex.webRequest.onBeforeRequest.addListener(
- read_document,
- {urls:["<all_urls>"], types:["main_frame"]},
+ {urls: ["<all_urls>"], types: all_types},
["blocking"]
);
+
+ // Analyzes all the html documents and external scripts as they're loaded
+ ResponseProcessor.install(responseHandler);
legacy_license_lib.init();
}