From dbe1ddbc1ddd9b40c6e0e5c896768f91cb6994e0 Mon Sep 17 00:00:00 2001 From: hackademix Date: Sun, 15 Jul 2018 23:15:34 +0200 Subject: Refactor and fix HTTP response filtering to touch only scripts, either external or embedded inside HTML documents and sub-documents. --- bg/ResponseMetaData.js | 82 ++++++++++++++++++++++++++++ bg/ResponseProcessor.js | 110 +++++++++++++++++++++++++++++++++++++ main_background.js | 142 ++++++++++++++---------------------------------- 3 files changed, 234 insertions(+), 100 deletions(-) create mode 100644 bg/ResponseMetaData.js create mode 100644 bg/ResponseProcessor.js diff --git a/bg/ResponseMetaData.js b/bg/ResponseMetaData.js new file mode 100644 index 0000000..40ca3f3 --- /dev/null +++ b/bg/ResponseMetaData.js @@ -0,0 +1,82 @@ +/** +* GNU LibreJS - A browser add-on to block nonfree nontrivial JavaScript. +* +* Copyright (C) 2018 Giorgio Maone +* +* This file is part of GNU LibreJS. +* +* GNU LibreJS is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* GNU LibreJS is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with GNU LibreJS. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + This class parses HTTP response headers to extract both the + MIME Content-type and the character set to be used, if specified, + to parse textual data through a decoder. 
+*/ + +class ResponseMetaData { + constructor(request) { + let {responseHeaders} = request; + this.headers = {}; + for (let h of responseHeaders) { + if (/^\s*Content-(Type|Disposition)\s*$/i.test(h.name)) { + let propertyName = h.name.split("-")[1].trim(); + propertyName = `content${propertyName.charAt(0).toUpperCase()}${propertyName.substring(1).toLowerCase()}`; + this[propertyName] = h.value; + this.headers[propertyName] = h; + } + } + this.forcedUTF8 = false; + } + + get charset() { + let charset = ""; + if (this.contentType) { + let m = this.contentType.match(/;\s*charset\s*=\s*(\S+)/); + if (m) { + charset = m[1]; + } + } + Object.defineProperty(this, "charset", { value: charset, writable: false, configurable: true }); + return charset; + } + + get isUTF8() { + return /^utf-8$/i.test(this.charset); + } + + forceUTF8() { + if (!(this.forcedUTF8 || this.isUTF8)) { + let h = this.headers.contentType; + if (h) { + h.value = h.value.replace(/;\s*charset\s*=.*|$/, "; charset=utf8"); + this.forcedUTF8 = true; + } // if the header doesn't exist the browser should default to UTF-8 anyway + } + return this.forcedUTF8; + } + + createDecoder() { + if (this.charset) { + try { + return new TextDecoder(this.charset); + } catch (e) { + console.error(e); + } + } + return new TextDecoder("utf-8"); + } +}; + +module.exports = { ResponseMetaData }; diff --git a/bg/ResponseProcessor.js b/bg/ResponseProcessor.js new file mode 100644 index 0000000..3f3151b --- /dev/null +++ b/bg/ResponseProcessor.js @@ -0,0 +1,110 @@ +/** +* GNU LibreJS - A browser add-on to block nonfree nontrivial JavaScript. +* +* Copyright (C) 2018 Giorgio Maone +* +* This file is part of GNU LibreJS. +* +* GNU LibreJS is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. 
+* +* GNU LibreJS is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with GNU LibreJS. If not, see <http://www.gnu.org/licenses/>. +*/ + +/** + An abstraction layer over the StreamFilter API, allowing its clients to process + only the "interesting" HTML and script requests and leaving the other alone +*/ + +let {ResponseMetaData} = require("./ResponseMetaData.js"); + +let listeners = new WeakMap(); +let webRequestEvent = browser.webRequest.onHeadersReceived; + +class ResponseProcessor { + + static install(handler, types = ["main_frame", "sub_frame", "script"]) { + if (listeners.has(handler)) return false; + let listener = + request => new ResponseTextFilter(request).process(handler); + listeners.set(handler, listener); + webRequestEvent.addListener( + listener, + {urls: ["<all_urls>"], types}, + ["blocking", "responseHeaders"] + ); + return true; + } + + static uninstall(handler) { + let listener = listeners.get(handler); + if (listener) { + webRequestEvent.removeListener(listener); + } + } +} + +class ResponseTextFilter { + constructor(request) { + this.request = request; + let {type, statusCode} = request; + let md = this.metaData = new ResponseMetaData(request); + this.canProcess = // we want to process html documents and scripts only + (statusCode < 300 || statusCode >= 400) && // skip redirections + !md.disposition && // skip forced downloads + (type === "script" || /\bhtml\b/i.test(md.contentType)); + } + + process(handler) { + if (!this.canProcess) return {}; + let metaData = this.metaData; + let {requestId, responseHeaders} = this.request; + let filter = browser.webRequest.filterResponseData(requestId); + let buffer = []; + + filter.ondata = event => { + buffer.push(event.data); + }; + + filter.onstop = async event => { + let decoder = 
metaData.createDecoder(); + let params = {stream: true}; + let text = this.text = buffer.map( + chunk => decoder.decode(chunk, params)) + .join(''); + let editedText = null; + try { + let response = { + request: this.request, + metaData, + text, + }; + editedText = await handler(response); + } catch(e) { + console.error(e); + } + if (metaData.forcedUTF8 || + editedText !== null && text !== editedText) { + // if we changed the charset, the text or both, let's re-encode + filter.write(new TextEncoder().encode(editedText)); + } else { + // ... otherwise pass all the raw bytes through + for (let chunk of buffer) filter.write(chunk); + } + + filter.disconnect(); + } + + return metaData.forceUTF8() ? {responseHeaders} : {}; + } +} + +module.exports = { ResponseProcessor }; diff --git a/main_background.js b/main_background.js index debe0c2..ff68479 100644 --- a/main_background.js +++ b/main_background.js @@ -25,6 +25,7 @@ var acorn = require('acorn/dist/acorn_loose'); var jssha = require('jssha'); var walk = require("acorn/dist/walk"); var legacy_license_lib = require("./legacy_license_check.js"); +var {ResponseProcessor} = require("./bg/ResponseProcessor"); console.log("main_background.js"); /** @@ -853,7 +854,7 @@ function license_valid(matches){ * reason text * ] */ -function license_read(script_src, name){ +function license_read(script_src, name, external = false){ var reason_text = ""; @@ -970,7 +971,7 @@ function get_script(response,url,tabid,wl,index=-1){ } edited = [true,response,"Page is whitelisted in preferences"]; }else{ - edited = license_read(response,scriptname); + edited = license_read(response,scriptname,index == -2); } var src_hash = hash(response); var verdict = edited[0]; @@ -1066,35 +1067,28 @@ function block_ga(a){ else return {}; } + + /** -* This is a callback trigged from requests caused by script tags with the src="" attribute. 
+* This listener gets called as soon as we've got all the HTTP headers, can guess +* content type and encoding, and therefore correctly parse HTML documents and +* and external script inclusion in search of non-free JavaScript */ -function read_script(a){ - var GA = test_GA(a); - if(GA !== false){ - return GA; - } - var filter = webex.webRequest.filterResponseData(a.requestId); - var decoder = new TextDecoder("utf-8"); - var encoder = new TextEncoder(); - var str = ""; - - filter.onstop = event => { - dbg_print("read_script "+a.url); - var res = test_url_whitelisted(a.url); - res.then(function(whitelisted){ - var edit_script = get_script(str,a.url,a["tabId"],whitelisted,-1); - edit_script.then(function(edited){ - filter.write(encoder.encode(edited)); - filter.disconnect(); - }); - }); - } - filter.ondata = event => { - str += decoder.decode(event.data, {stream: true}); - } - return {}; +async function responseHandler(response) { + let {url, type} = response.request; + let whitelisted = await test_url_whitelisted(url); + let handle_it = type === "script" ? 
handle_script : handle_html; + return await handle_it(response, whitelisted); +} + +/** +* Here we handle external script requests +*/ +async function handle_script(response, whitelisted){ + let {text, request} = response; + let {url, tabId} = request; + return await get_script(text, url, tabId, whitelisted, -2); } /** @@ -1260,61 +1254,21 @@ function edit_html(html,url,tabid,wl){ } /** -* Callback for main frame requests -* +* Here we handle html document responses */ -function read_document(a){ - var GA = test_GA(a); - if(GA != false){ - return GA; - } - var str = ""; - var filter = webex.webRequest.filterResponseData(a.requestId); - var decoder = new TextDecoder("utf-8"); - var encoder = new TextEncoder(); - filter.onerror = event => { - dbg_print("%c Error in getting document","color:red"); - } - filter.onstop = event => { - time = Date.now(); - delete unused_data[a["tabId"]]; - webex.browserAction.setBadgeText({ - text: "✓", - tabId: a["tabId"] - }); - webex.browserAction.setBadgeBackgroundColor({ - color: "green", - tabId: a["tabId"] - }); - var test = new ArrayBuffer(); - var res = test_url_whitelisted(a.url); - res.then(function(whitelisted){ - var edit_page; - // TODO Fix this ugly HACK! - if(! 
str.includes("<html [… removed lines lost in extraction …] - filter.ondata = event => { - str += decoder.decode(event.data, {stream: true}); - } - return {}; +async function handle_html(response, whitelisted) { + let {text, request} = response; + let {url, tabId} = request; + delete unused_data[tabId]; + browser.browserAction.setBadgeText({ + text: "✓", + tabId + }); + browser.browserAction.setBadgeBackgroundColor({ + color: "green", + tabId + }); + return await edit_html(text, url, tabId, false); } /** @@ -1329,32 +1283,20 @@ function init_addon(){ webex.tabs.onRemoved.addListener(delete_removed_tab_info); // Prevents Google Analytics from being loaded from Google servers - var all_types = [ + let all_types = [ "beacon", "csp_report", "font", "image", "imageset", "main_frame", "media", "object", "object_subrequest", "ping", "script", "stylesheet", "sub_frame", "web_manifest", "websocket", "xbl", "xml_dtd", "xmlhttprequest", "xslt", "other" - ] - // Analyzes remote scripts + ]; webex.webRequest.onBeforeRequest.addListener( block_ga, - {urls:["<all_urls>"], types:all_types}, - ["blocking"] - ); - - // Analyzes remote scripts - webex.webRequest.onBeforeRequest.addListener( - read_script, - {urls:["<all_urls>"], types:["script"]}, - ["blocking"] - ); - - // Analyzes the scripts inside of HTML - webex.webRequest.onBeforeRequest.addListener( - read_document, - {urls:["<all_urls>"], types:["main_frame"]}, + {urls: ["<all_urls>"], types: all_types}, ["blocking"] ); + + // Analyzes all the html documents and external scripts as they're loaded + ResponseProcessor.install(responseHandler); legacy_license_lib.init(); } -- cgit v1.2.3