From d78d3dae30a4ac662ac5d35f1e7f245ce5825f40 Mon Sep 17 00:00:00 2001
From: Yuchen Pei
Date: Thu, 22 Sep 2022 15:36:04 +1000
Subject: Separating out script and license checking routines to common/checks

---
 common/checks.js | 380 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 common/debug.js  |  46 ++++++
 2 files changed, 419 insertions(+), 7 deletions(-)
 create mode 100644 common/debug.js

(limited to 'common')

diff --git a/common/checks.js b/common/checks.js
index f3c27bc..e17fc02 100644
--- a/common/checks.js
+++ b/common/checks.js
@@ -20,18 +20,61 @@
 * along with GNU LibreJS.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+const acorn = require('acorn');
 const { licenses } = require('./license_definitions.js');
 const { patternUtils } = require('./pattern_utils.js');
+const { makeDebugLogger } = require('./debug.js');
+const fnameData = require('../fname_data.json').fname_data;
 
 const LIC_RE = /@licstartThefollowingistheentirelicensenoticefortheJavaScriptcodeinthis(?:page|file)(.*)?@licendTheaboveistheentirelicensenoticefortheJavaScriptcodeinthis(?:page|file)/mi;
 
+/*
+  NONTRIVIAL THINGS:
+  - Fetch
+  - XMLhttpRequest
+  - eval()
+  - ?
+  JAVASCRIPT CAN BE FOUND IN:
+  - Event handlers (onclick, onload, onsubmit, etc.)
+  - <script>JS</script>
+  - <script src="/JS.js"></script>
+  WAYS TO DETERMINE PASS/FAIL:
+  - "// @license [magnet link] [identifier]" then "// @license-end" (may also use /* comments)
+  - Automatic whitelist: (http://bzr.savannah.gnu.org/lh/librejs/dev/annotate/head:/data/script_libraries/script-libraries.json_
+*/
+// These are objects that it will search for in an initial regex pass over non-free scripts.
+const RESERVED_OBJECTS = [
+  //"document",
+  //"window",
+  'fetch',
+  'XMLHttpRequest',
+  'chrome', // only on chrome
+  'browser', // only on firefox
+  'eval'
+];
+const LOOPKEYS = new Set(['for', 'if', 'while', 'switch']);
+const OPERATORS = new Set(['||', '&&', '=', '==', '++', '--', '+=', '-=', '*']);
+// @license match, second and third capture groups are canonicalUrl
+// and license name
+const OPENING_LICENSE_RE = /\/[/*]\s*?(@license)\s+(\S+)\s+(\S+).*$/mi;
+const CLOSING_LICENSE_RE = /\/([*/])\s*@license-end\s*(\*\/)?/mi;
+/**
+* If this is true, it evaluates entire scripts instead of returning as soon as it encounters a violation.
+*
+* Also, it controls whether or not this part of the code logs to the console.
+*
+*/
+const DEBUG = false; // debug the JS evaluation
+const PRINT_DEBUG = false;
+const dbg_print = makeDebugLogger('checks.js', PRINT_DEBUG, Date.now());
 
 /**
  * stripLicenseToRegexp
  *
  * Removes all non-alphanumeric characters except for the
  * special tokens, and replace the text values that are
- * hardcoded in license_definitions.js
+ * hardcoded in license_definitions.js.  Puts the result in
+ * the regex field of the fragments.
  *
  */
 const stripLicenseToRegexp = function(license) {
@@ -49,12 +92,10 @@ const init = function() {
   }
 }
 
-module.exports.init = init;
-
 /**
 *
 * Takes in the declaration that has been preprocessed and
-* tests it against regexes in our table.
+* tests it against regexes in licenses.
 */
 const searchTable = function(strippedComment) {
   const stripped = patternUtils.removeNonalpha(strippedComment);
@@ -73,10 +114,10 @@ const searchTable = function(strippedComment) {
 
 /**
  * Checks whether licenseText, modulo whitespace, starts with
- * a @licstart / @licend with a free license, returns the license name
+ * a @licstart .. @licend with a free license, returns the license name
  * if so, and null otherwise.
  */
-const check = function(licenseText) {
+const checkLicenseText = function(licenseText) {
   if (licenseText === undefined || licenseText === null) {
     return null;
   }
@@ -87,4 +128,329 @@ const check = function(licenseText) {
   return matches && searchTable(matches[0]);
 };
 
-module.exports.check = check;
+//************************this part can be tested in the HTML file index.html's script test.js****************************
+
+/**
+ * Checks whether script is trivial by analysing its tokens.
+ *
+ * Keyword tokens are counted against a limit of three
+ * loops/conditionals (LOOPKEYS); value tokens are looked up in
+ * fnameData and checked for bracket suffix notation.  Unless DEBUG
+ * is true, evaluation stops at the first violation.
+ *
+ * @param {string} script - source text to tokenize with acorn
+ * @returns {Array} [flag (boolean, true if trivial),
+ *   reason (string, human readable report)]
+ */
+function fullEvaluate(script) {
+  if (script === undefined || script == '') {
+    return [true, 'Harmless null script'];
+  }
+
+  let tokens;
+
+  try {
+    tokens = acorn.tokenizer(script);
+  } catch (e) {
+    console.warn('Tokenizer could not be initiated (probably invalid code)');
+    return [false, 'Tokenizer could not be initiated (probably invalid code)'];
+  }
+  try {
+    var toke = tokens.getToken();
+  } catch (e) {
+    console.log(script);
+    console.log(e);
+    console.warn('couldn\'t get first token (probably invalid code)');
+    console.warn('Continuing evaluation');
+    // NOTE(review): toke stays undefined here, so the token loop below is skipped.
+  }
+
+  let amtloops = 0;
+  let definesFunctions = false;
+
+  /**
+   * Given the end of an identifier token, tests whether the next non-whitespace character is '['
+   */
+  function is_bsn(end) {
+    let i = 0;
+    while (script.charAt(end + i).match(/\s/g) !== null) {
+      i++;
+      if (i >= script.length - 1) {
+        return false;
+      }
+    }
+    return script.charAt(end + i) === '[';
+  }
+
+  function evaluateByTokenValue(toke) {
+    const value = toke.value;
+    if (OPERATORS.has(value)) {
+      // It's just an operator. Javascript doesn't have operator overloading so it must be some
+      // kind of primitive (I.e. a number)
+    } else {
+      const status = fnameData[value];
+      if (status === true) { // is the identifier banned?
+        dbg_print('%c NONTRIVIAL: nontrivial token: \'' + value + '\'', 'color:red');
+        if (DEBUG === false) {
+          return [false, 'NONTRIVIAL: nontrivial token: \'' + value + '\''];
+        }
+      } else if (status === false || status === undefined) {// is the identifier not banned or user defined?
+        // Is there bracket suffix notation?
+        if (is_bsn(toke.end)) {
+          dbg_print('%c NONTRIVIAL: Bracket suffix notation on variable \'' + value + '\'', 'color:red');
+          if (DEBUG === false) {
+            return [false, 'NONTRIVIAL: Bracket suffix notation on variable \'' + value + '\''];
+          }
+        }
+      } else {
+        dbg_print('trivial token:' + value);
+      }
+    }
+    return [true, ''];
+  }
+
+  function evaluateByTokenTypeKeyword(keyword) {
+    if (keyword === 'function') {
+      dbg_print('%c NOTICE: Function declaration.', 'color:green');
+      definesFunctions = true;
+    }
+
+    if (LOOPKEYS.has(keyword)) {
+      amtloops++;
+      if (amtloops > 3) {
+        dbg_print('%c NONTRIVIAL: Too many loops/conditionals.', 'color:red');
+        if (DEBUG === false) {
+          return [false, 'NONTRIVIAL: Too many loops/conditionals.'];
+        }
+      }
+    }
+    return [true, ''];
+  }
+
+  while (toke !== undefined && toke.type !== acorn.tokTypes.eof) {
+    if (toke.type.keyword !== undefined) {
+      //dbg_print("Keyword:");
+      //dbg_print(toke);
+
+      // This type of loop detection ignores functional loop alternatives and ternary operators
+      const tokeTypeRes = evaluateByTokenTypeKeyword(toke.type.keyword);
+      if (tokeTypeRes[0] === false) {
+        return tokeTypeRes;
+      }
+    } else if (toke.value !== undefined) {
+      const tokeValRes = evaluateByTokenValue(toke);
+      if (tokeValRes[0] === false) {
+        return tokeValRes;
+      }
+    }
+    // If not a keyword or an identifier it's some kind of operator, field parenthesis, brackets
+    try {
+      toke = tokens.getToken();
+    } catch (e) {
+      dbg_print('Denied script because it cannot be parsed.');
+      return [false, 'NONTRIVIAL: Cannot be parsed. This could mean it is a 404 error.'];
+    }
+  }
+
+  dbg_print('%cAppears to be trivial.', 'color:green;');
+  if (definesFunctions === true)
+    return [true, 'Script appears to be trivial but defines functions.'];
+  else
+    return [true, 'Script appears to be trivial.'];
+}
+
+
+//****************************************************************************************************
+/**
+* This is the entry point for full code evaluation for triviality.
+*
+* Performs the initial pass on code to see if it needs to be completely parsed
+*
+* This can only determine if a script is bad, not if it's good
+*
+* If it passes the initial pass, it runs the full pass and returns the result
+*
+* It returns an array of [flag (boolean, false if "bad"), reason (string, human readable report)]
+*
+* @param {string} script - source text of the script
+* @param {string} name - script name, used only in debug output
+*/
+function evaluate(script, name) {
+  const reservedResult = evaluateForReservedObj(script, name);
+  if (reservedResult[0] === true) {
+    dbg_print('%c pass', 'color:green;');
+  } else {
+    return reservedResult;
+  }
+
+  return fullEvaluate(script);
+}
+
+/**
+ * Initial regex pass: replaces quoted strings with placeholders, removes
+ * comments, then looks for uses of any name in RESERVED_OBJECTS.
+ *
+ * @param {string} script - source text of the script
+ * @param {string} name - script name, used only in debug output
+ * @returns {Array} [flag (boolean, false if a reserved object is used),
+ *   reason (string)]
+ */
+function evaluateForReservedObj(script, name) {
+  function reservedObjectRegex(object) {
+    const arithOperators = '\\+\\-\\*\\/\\%\\=';
+    return new RegExp('(?:[^\\w\\d]|^|(?:' + arithOperators + '))' + object + '(?:\\s*?(?:[\\;\\,\\.\\(\\[])\\s*?)', 'g');
+  }
+  const mlComment = /\/\*([\s\S]+?)\*\//g;
+  const ilComment = /\/\/.+/gm;
+  const temp = script.replace(/'.+?'+/gm, '\'string\'').replace(/".+?"+/gm, '"string"').replace(mlComment, '').replace(ilComment, '');
+  dbg_print('%c ------evaluation results for ' + name + '------', 'color:white');
+  dbg_print('Script accesses reserved objects?');
+
+  // This is where individual "passes" are made over the code
+  for (const reserved of RESERVED_OBJECTS) {
+    if (reservedObjectRegex(reserved).exec(temp) !== null) {
+      dbg_print('%c fail', 'color:red;');
+      return [false, 'Script uses a reserved object (' + reserved + ')'];
+    }
+  }
+  return [true, 'Reserved object not found.'];
+}
+
+/**
+ * Checks whether url is the magnet link of a license.
+ *
+ * Returns the licenseName if so, otherwise returns null.  If a key is
+ * supplied, checks for the license with the key only.
+ *
+ * "&amp;" in url is decoded to "&" before comparing.
+ */
+function checkMagnet(url, key = null) {
+  const fixedUrl = url.replace(/&amp;/g, '&');
+  // Match by magnet link
+  const checkLicenseMagnet = license => {
+    for (const cUrl of license.canonicalUrl) {
+      if (cUrl.startsWith('magnet:') && fixedUrl === cUrl) {
+        return license.licenseName;
+      }
+    }
+    return null;
+  };
+
+  if (key) {
+    try {
+      return checkLicenseMagnet(licenses[key]);
+    } catch (error) {
+      return null;
+    }
+  } else {
+    for (const licenseKey in licenses) {
+      const result = checkLicenseMagnet(licenses[licenseKey]);
+      if (result) return result;
+    }
+    return null;
+  }
+}
+
+
+/**
+ *
+ * Evaluates the content of a script for licenses and triviality
+ * scriptSrc: content of the script; name: script name; external:
+ * whether the script is external
+ *
+ * Returns
+ * [
+ * true (accepted) or false (denied),
+ * edited content,
+ * reason text
+ * ]
+ */
+function checkScriptSource(scriptSrc, name, external = false) {
+  let inSrc = scriptSrc.trim();
+  if (!inSrc) return [true, scriptSrc, 'Empty source.'];
+
+  // Check for @licstart .. @licend method
+  const license = checkLicenseText(scriptSrc);
+  if (license) {
+    return [true, scriptSrc, `Licensed under: ${license}`];
+  }
+
+  let outSrc = '';
+  let reason = '';
+  let partsDenied = false;
+  let partsAccepted = false;
+
+  function checkTriviality(s) {
+    if (!patternUtils.removeJsComments(s).trim()) {
+      return true; // empty, ignore it
+    }
+    const [trivial, message] = external ?
+      [false, 'External script with no known license']
+      : evaluate(s, name);
+    if (trivial) {
+      partsAccepted = true;
+      outSrc += s;
+    } else {
+      partsDenied = true;
+      if (s.startsWith('javascript:'))
+        outSrc += `# LIBREJS BLOCKED: ${message}`;
+      else
+        outSrc += `/*\nLIBREJS BLOCKED: ${message}\n*/`;
+    }
+    reason += `\n${message}`;
+  }
+
+  // Consume inSrc by checking licenses in all @license / @license-end
+  // blocks and triviality outside these blocks
+  while (inSrc) {
+    const openingMatch = OPENING_LICENSE_RE.exec(inSrc);
+    const openingIndex = openingMatch ? openingMatch.index : inSrc.length;
+    // checks the triviality of the code before the license tag, if any
+    checkTriviality(inSrc.substring(0, openingIndex));
+    inSrc = inSrc.substring(openingIndex);
+    if (!inSrc) break;
+
+    // checks the remaining part, that starts with an @license
+    const closureMatch = CLOSING_LICENSE_RE.exec(inSrc);
+    if (!closureMatch) {
+      const msg = 'ERROR: @license with no @license-end';
+      return [false, `\n/*\n ${msg} \n*/\n`, msg];
+    }
+    let closureEndIndex = closureMatch.index + closureMatch[0].length;
+    const commentEndOffset = inSrc.substring(closureEndIndex).indexOf(closureMatch[1] === '*' ? '*/' : '\n');
+    if (commentEndOffset !== -1) {
+      closureEndIndex += commentEndOffset;
+    }
+
+    if (!(Array.isArray(openingMatch) && openingMatch.length >= 4)) {
+      const msg = 'Malformed or unrecognized license tag.';
+      return [false, `\n/*\n ${msg} \n*/\n`, msg];
+    }
+    const licenseName = checkMagnet(openingMatch[2]);
+    let message;
+    if (licenseName) {
+      message = `Recognized license: "${licenseName}".`;
+      outSrc += inSrc.substring(0, closureEndIndex);
+      partsAccepted = true;
+    } else {
+      message = `Unrecognized license tag: "${openingMatch[0]}"`;
+      outSrc += `\n/*\n${message}\n*/\n`;
+      partsDenied = true;
+    }
+    reason += `\n${message}`;
+
+    // trim off everything we just evaluated
+    inSrc = inSrc.substring(closureEndIndex).trim();
+  }
+
+  if (partsDenied) {
+    if (partsAccepted) {
+      reason = `Some parts of the script have been disabled (check the source for details).\n^--- ${reason}`;
+    }
+    return [false, outSrc, reason];
+  }
+
+  return [true, scriptSrc, reason];
+}
+
+module.exports = { init, checkLicenseText, checkMagnet, checkScriptSource };
diff --git a/common/debug.js b/common/debug.js
new file mode 100644
index 0000000..b192862
--- /dev/null
+++ b/common/debug.js
@@ -0,0 +1,46 @@
+/**
+* GNU LibreJS - A browser add-on to block nonfree nontrivial JavaScript.
+* *
+* Copyright (C) 2017, 2018 Nathan Nichols
+* Copyright (C) 2018 Ruben Rodriguez
+* Copyright (C) 2022 Yuchen Pei
+*
+* This file is part of GNU LibreJS.
+*
+* GNU LibreJS is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* GNU LibreJS is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with GNU LibreJS.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * Makes a logger that, when enabled, first logs origin and the seconds
+ * elapsed since time, then logs its argument(s); otherwise it does nothing.
+ *
+ * @param {string} origin - label printed in square brackets
+ * @param {boolean} enabled - whether the returned logger prints anything
+ * @param {number} time - start timestamp in milliseconds, compared against Date.now()
+ * @returns {function} logger taking a message and an optional second console.log argument
+ */
+const makeDebugLogger = (origin, enabled, time) => {
+  return (a, b) => {
+    if (enabled) {
+      console.log('[' + origin + '] Time spent so far: ' + (Date.now() - time) / 1000 + ' seconds');
+      if (b === undefined) {
+        console.log(a);
+      } else {
+        console.log(a, b);
+      }
+    }
+  };
+};
+
+module.exports = { makeDebugLogger };
-- 
cgit v1.2.3