about summary refs log tree commit diff
path: root/contact_finder.js
diff options
context:
space:
mode:
author NateN1222 <nathannichols454@gmail.com> 2017-08-07 12:16:14 -0500
committer NateN1222 <nathannichols454@gmail.com> 2017-08-07 12:16:14 -0500
commit 6a92e509017e0169c896b3e221a8673c5b490509 (patch)
tree 145dbd80c7e4d9c9f12e621eaf38b0d60a9e1d1c /contact_finder.js
parent 54e50fc09ca1c37d61119cd01942c9fa9427b237 (diff)
started to implement the regexes from the current LibreJS into the contact finder
Diffstat (limited to 'contact_finder.js')
-rw-r--r-- contact_finder.js 222
1 files changed, 193 insertions, 29 deletions
diff --git a/contact_finder.js b/contact_finder.js
index 759b3b0..69b9254 100644
--- a/contact_finder.js
+++ b/contact_finder.js
@@ -7,6 +7,100 @@
*/
 // Now, the contact finder will load on every page and you can test it wherever you want.
+
+//*********************************************************************************************
+//Regexes taken from "contact_regex.js" in the current LibreJS
+//Copyright (C) 2011, 2012, 2014 Loic J. Duros
+//Copyright (C) 2014, 2015 Nik Nyby
+
+// email address regexp: whole-string, case-insensitive match for a "mailto:" link whose local part is one of the listed role names (admin, feedback, webmaster, ...) and whose final domain label is 2-4 letters
+var reEmail = /^mailto\:(admin|feedback|webmaster|info|contact|support|comments|team|help)\@[a-z0-9.\-]+\.[a-z]{2,4}$/i;
+
+var reAnyEmail = /^mailto\:.*?\@[a-z0-9\.\-]+\.[a-z]{2,4}$/i; // same shape as reEmail, but accepts any text (possibly empty) between "mailto:" and the "@"
+
+// twitter address regexp: unanchored, case-insensitive; matches "twitter.com/", an optional "#/" or "!#/" prefix, then zero or more letters/digits
+var reTwitter = /twitter\.com\/(\!?#\/)?[a-z0-9]*/i;
+
+// identi.ca address regexp: unanchored, case-insensitive; matches "identi.ca/" not immediately followed by "notice/", then zero or more letters/digits
+var reIdentiCa = /identi\.ca\/(?!notice\/)[a-z0-9]*/i;
+
+/**
+ * contactSearchStrings
+ * Contains arrays of strings classified by language
+ * and by degree of certainty.
+ *
+ * Stored in the variable contactStr. The top-level keys are the language codes
+ * 'da', 'en', 'es' and 'fr'; each maps to an object with the keys 'certain',
+ * 'probable' and 'uncertain', whose values are arrays of regular-expression
+ * source strings (backslashes are doubled because these are string literals,
+ * not regex literals).
+ *
+ * NOTE(review): the first 'probable' entry of every language uses '[\\s]'
+ * without a '*', so it only matches text that starts with one whitespace
+ * character immediately followed by the word -- confirm whether '[\\s]*'
+ * was intended, as in the neighbouring patterns.
+ */
+var contactStr = {
+ 'da': {
+ 'certain': [
+ '^[\\s]*Kontakt os[\\s]*$',
+ '^[\\s]*Email Os[\\s]*$',
+ '^[\\s]*Kontakt[\\s]*$'
+ ],
+ 'probable': ['^[\\s]Kontakt', '^[\\s]*Email'],
+ 'uncertain': [
+ '^[\\s]*Om Us',
+ '^[\\s]*Om',
+ 'Hvem vi er'
+ ]
+ },
+ 'en': {
+ 'certain': [
+ '^[\\s]*Contact Us[\\s]*$',
+ '^[\\s]*Email Us[\\s]*$',
+ '^[\\s]*Contact[\\s]*$',
+ '^[\\s]*Feedback[\\s]*$',
+ '^[\\s]*Web.?site Feedback[\\s]*$'
+ ],
+ 'probable': ['^[\\s]Contact', '^[\\s]*Email'],
+ 'uncertain': [
+ '^[\\s]*About Us',
+ '^[\\s]*About',
+ 'Who we are',
+ 'Who I am',
+ 'Company Info',
+ 'Customer Service'
+ ]
+ },
+ 'es': {
+ 'certain': [
+ '^[\\s]*contáctenos[\\s]*$',
+ '^[\\s]*Email[\\s]*$'
+ ],
+ 'probable': ['^[\\s]contáctenos', '^[\\s]*Email'],
+ 'uncertain': [
+ 'Acerca de nosotros'
+ ]
+ },
+ 'fr': {
+ 'certain': [
+ '^[\\s]*Contactez nous[\\s]*$',
+ '^[\\s]*(Nous )?contacter[\\s]*$',
+ '^[\\s]*Email[\\s]*$',
+ '^[\\s]*Contact[\\s]*$',
+ '^[\\s]*Commentaires[\\s]*$'
+ ],
+ 'probable': ['^[\\s]Contact', '^[\\s]*Email'],
+ 'uncertain': [
+ '^[\\s]*(A|À) propos',
+ 'Qui nous sommes',
+ 'Qui suis(-| )?je',
+ 'Info',
+ 'Service Client(e|è)le'
+ ]
+ }
+};
+
+var usaPhoneNumber = new RegExp(/(?:\+ ?1 ?)?\(?[2-9]{1}[0-9]{2}\)?(?:\-|\.| )?[0-9]{3}(?:\-|\.| )[0-9]{4}(?:[^0-9])/mg); // optional "+1" prefix, 3-digit area code starting with 2-9 (parentheses optional), optional separator, 3 digits, a required "-", "." or space, 4 digits, then one non-digit character; NOTE(review): that trailing non-digit is required, so a number at the very end of the text will not match
+
+//*********************************************************************************************
+
+
+/**
+*
+* Creates a transparent floating button from a name string and a callback
+*
+*
+*/
var button_i = 0;
if(document.getElementById("abc123_main_div") !== null){
document.getElementById("abc123_main_div").remove();
@@ -22,55 +116,125 @@ function new_debug_button(name_text,callback){
button_i = button_i + 1;
}
 /**
+* returns input with all elements not of type string removed
+*
+* @param a - collection to filter; its keys are enumerated with for...in, so a null or
+*            undefined value (for example the null that String.prototype.match() returns
+*            when nothing matches) yields an empty result instead of throwing
+* @returns a new array holding only the values whose typeof is "string"; a itself is
+*          not modified
+*/
+function remove_not_str(a){
+ var new_a = [];
+ for(var i in a){
+ if(typeof(a[i]) == "string"){
+ new_a.push(a[i])
+ }
+ }
+ return new_a;
+}
+/**
+* Tests all links on the page for regexes under a certain certainty level.
+*
+* Will return either the first regex match from the selected certainty level or all regexes that
+* match on that certainty level.
+*
+* certainty_lvl can be "certain" > "probable" > "uncertain"
+*
+* For each entry of document.links the text under test is its innerText, a space, then its href.
+* Every pattern in contactStr[language][certainty_lvl] is compiled with new RegExp(pattern, "g");
+* no "i" flag is passed, so matching is case-sensitive.
+*
+* @param certainty_lvl - key selecting which pattern list of each language is used
+* @param first - when true (the default) return as soon as any link matches; when false collect
+*                every matching link (each link at most once) and log each match
+* @returns an object with two keys: "fail" (false when at least one link matched, true otherwise)
+*          and "result" (the single matching link when first is true and a match was found,
+*          otherwise the array of matching links, which is empty when nothing matched)
+*/
+function attempt(certainty_lvl, first=true){
+ // There needs to be some kind of max so that people can't troll by for example leaving a comment with a bunch of emails
+ // to cause LibreJS users to slow down.
+ var fail_flag = true;
+ var flag;
+ var matches = [];
+ var result = [];
+ var str_under_test = "";
+ for(var i in document.links){ // for...in enumerates property keys; the typeof guard below skips any entry that is not a link with string innerText and href
+ if( typeof(document.links[i].innerText) != "string" || typeof(document.links[i].href) != "string"){
+ continue;
+ }
+ str_under_test = document.links[i].innerText + " " + document.links[i].href;
+ flag = true; // reset per link; cleared after this link's first match so the link is recorded only once
+ for(var j in contactStr){
+ for(var k in contactStr[j][certainty_lvl]){
+ if(flag){
+ result = [];
+ result = str_under_test.match(new RegExp(contactStr[j][certainty_lvl][k],"g"));
+ result = remove_not_str(result);
+ if(result !== undefined && typeof(result[0]) == "string" ){ // NOTE(review): remove_not_str always returns an array, so the undefined comparison can never be false
+ if(first){
+ return {"fail":false,"result":document.links[i]};
+ } else{
+ console.log(document.links[i].href + " matched " + contactStr[j][certainty_lvl][k]);
+ matches.push(document.links[i]);
+ fail_flag = false;
+ flag = false;
+ }
+ }
+ }
+ }
+ }
+ }
+ console.log(matches);
+ return {"fail":fail_flag,"result":matches};
+}
+
+/**
 * "LibreJS detects contact pages, email addresses that are likely to be owned by the
 * maintainer of the site, Twitter and identi.ca links, and phone numbers."
+ *
+ * With the added lines of this change the function calls attempt() with "certain", then
+ * "probable", then "uncertain", stopping at the first level that reports a match, and
+ * logs its progress and the final result to the console. Nothing is returned.
+ *
+ * NOTE(review): when no level matches, res is left holding the whole object returned by
+ * attempt("uncertain"), and that object is what gets logged as the final result.
+ * NOTE(review): all, phone_num, twitlinks, identi and contact_pages are still declared,
+ * but after this change they are referenced only inside the commented-out RESULTS block
+ * (all is not referenced at all).
 */
 function find_contacts(){
- var all = document.documentElement.innerHTML;
- var emails = [];
- emails.push(all.match(/\S+@\S+\.\S+\b/g));
- // 1.555.123.4567
- //+1.555.123.4567
- var phone_num = [];
- phone_num.push(all.match(/(\d{1,3}\.)?(\d\d\d)\.(\d\d\d)\.(\d\d\d\d)/g));
- // 1-555-123-4567
- //+1-555-123-4567
- phone_num.push(all.match(/(\+?\d)?([\-|\.])(\d\d\d)\2(\d\d\d)\2(\d\d\d\d)/g));
- // +15554567890
- phone_num.push(all.match(/\+?\d{10,15}\b/g));
- // twitter handles
- var twitter = [];
- twitter.push(all.match(/@\w{3,15}\b/g));
- // twitter links
+ var all = document.documentElement.innerText;
+ var phone_num = [];
 var twitlinks = [];
- twitlinks.push(all.match(/twitter\.com\/\w{3,15}\b/g));
- // identi.ca link
- // 25 is my guess at the max username length (I don't actually know)
 var identi = [];
- identi.push(all.match(/identi\.ca\/\w{3,25}\b/g));
- // Attempt to find contact pages
- var contact_pages = [];
- var links = document.getElementsByTagName("a");
- for(i in links){
- if(links[i].href !== undefined && links[i].href.indexOf("contact") != -1){
- contact_pages.push(links[i]);
+ var contact_pages = [];
+ console.log("certain:");
+ var res = attempt("certain");
+ var flag = true;
+ if(res["fail"] == false){
+ console.log("certain contact found:" + res["result"]);
+ res = res["result"];
+ flag = false;
+ }
+ if(flag){
+ console.log("probable:");
+ res = attempt("probable");
+ if(res["fail"] == false){
+ console.log("probable contact found:" + res["result"]);
+ res = res["result"];
+ flag = false;
+ }
+ }
+ if(flag){
+ console.log("uncertain:");
+ res = attempt("uncertain");
+ console.log(res);
+ if(res["fail"] == false){
+ console.log("uncertain contact found:" + res["result"]);
+ res = res["result"];
+ flag = false;
 }
 }
+ if(flag){
+ console.log("No contact found");
+ }
+ console.log("final result:");
+ console.log(res);
+ /*
 console.log("********************************************************");
 console.log("%c RESULTS: ","color: #dd0000;");
 console.log("%c " + phone_num.length + "%c phone numbers","color: red;","color: purple;");
- console.log("%c " + twitter.length + "%c twitter handles","color: red;","color: purple;");
 console.log("%c " + twitlinks.length + "%c twitter links","color: red;","color: purple;");
 console.log("%c " + identi.length + "%c identi.ca links","color: red;","color: purple;");
 console.log("%c " + contact_pages.length + "%c possible contact pages","color: red;","color: purple;");
 console.log("********************************************************");
+ */
-
+}
+// need to have this so the handler doesn't take too long -- NOTE(review): find_contacts() is still called synchronously below, so confirm what this wrapper is meant to save
+function handler(){ // passed to new_debug_button as the callback of the "Complain to website" button
+ find_contacts();
+ return 0; // always 0; whether the caller uses this value is not visible here
 }
-new_debug_button("Complain to website",find_contacts);
+new_debug_button("Complain to website",handler);
new_debug_button("Remove these buttons",function(){
if(document.getElementById("abc123_main_div") !== null){
document.getElementById("abc123_main_div").remove();