4 files changed, 135 insertions, 13 deletions
diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js
index 32155a4..b719124 100755
--- a/bin/reduceRangeDuplicates.js
+++ b/bin/reduceRangeDuplicates.js
@@ -22,6 +22,10 @@ const argv = require('yargs/yargs')(process.argv.slice(2))
     type: 'boolean',
     description: 'Dumps full debug logs'
   })
+  .option('verbose', {
+    type: 'boolean',
+    description: 'Verbose logging'
+  })
   .argv
 
 if (argv._.length < 2) {
@@ -37,10 +41,21 @@ if (!fs.existsSync(inputFile)) {
   process.exit(1)
 }
 
+// Identity key for an address feature: its housenumber, street, suburb,
+// state and postcode tags joined with '/'. Tags that are absent contribute
+// an empty segment, since Array.prototype.join renders undefined as ''.
+function hash(feature) {
+  const props = feature.properties
+  const keys = ['addr:housenumber', 'addr:street', 'addr:suburb', 'addr:state', 'addr:postcode']
+
+  return keys.map(key => props[key]).join('/')
+}
+
 let sourceCount = 0
 
 const ranges = []
 const nonRangesByStreet = {}
+const rangesRemovedInFilterA = {}
 
 // index all non-range addresses by street, suburb, state, postcode
 const index = new Transform({
@@ -78,7 +93,7 @@ const index = new Transform({
 const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
 
 /*
-* First pass removes ranges where each endpoint of the range exists seperatly
+* Second pass, filter A removes ranges where each endpoint of the range exists separately
 * eg.
 *  - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
 *  - 304 Cardigan Street Calton
@@ -98,9 +113,8 @@ const reduceRange = new Transform({
     }
     
     const isRange = feature.properties['addr:housenumber'].split('-').length > 1
-
     if (isRange) {
-      // see if it can be removed when each end point of the range is included seperatly
+      // see if it can be removed when each end point of the range is included separately
       const start = feature.properties['addr:housenumber'].split('-')[0]
       const end = feature.properties['addr:housenumber'].split('-')[1]
 
@@ -123,8 +137,8 @@ const reduceRange = new Transform({
         let pre = ''
         let suf = ''
 
-        matchCandidates.map(matchCandidate => {
-          if (start === matchCandidate.properties['addr:housenumber']) {
+        for (const matchCandidate of matchCandidates) {
+          if (!foundStart && start === matchCandidate.properties['addr:housenumber']) {
             foundStart = true
 
             const match = start.match(regexp)
@@ -132,13 +146,18 @@ const reduceRange = new Transform({
             pre = match.groups.pre
             suf = match.groups.suf
           }
-          if (end === matchCandidate.properties['addr:housenumber']) {
+          if (!foundEnd && end === matchCandidate.properties['addr:housenumber']) {
             foundEnd = true
 
             const match = end.match(regexp)
             endNum = match.groups.num
           }
-        })
+
+          if (foundStart && foundEnd) {
+            // stop early
+            break
+          }
+        }
 
         if (foundStart && foundEnd) {
           // found both start and end
@@ -160,10 +179,18 @@ const reduceRange = new Transform({
           if (!foundAllIntermediates) {
             // some intermediates were missing
             // but we'll pretend that's okay and let the geocoding algorithm use it's own interpolation to still find results
-            console.log('found endpoints but some intermediates are missing', feature)
+            if (argv.verbose) {
+              console.log('Filter A: Found endpoints but some intermediates are missing', feature)
+            }
           }
 
           // can be removed, feature not pushed
+          if (argv.verbose) {
+            console.log(`Filter A: ${feature.properties['addr:housenumber']} can be removed`)
+          }
+
+          // keep track of removed features for filter B, so we don't double remove both range and midpoints
+          rangesRemovedInFilterA[hash(feature)] = true
         } else {
           // since not both start and end found, then still include the range
           this.push(feature)
@@ -183,7 +210,7 @@ const reduceRange = new Transform({
 })
 
 /*
-* Second pass removes ane non-range elements where the range exists, and wasn't removed from the first pass
+* Second pass, filter B removes any non-range elements where the range exists, and wasn't removed by filter A
 * eg.
 *  - 249-263 Faraday Street
 *  - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
@@ -201,11 +228,12 @@ const reduceNonRange = new Transform({
     const isRange = feature.properties['addr:housenumber'].split('-').length > 1
 
     if (!isRange) {
-      // not a range, ahall be removed removed when this non-range exists within a range, but the range wasn't removed already
+      // not a range, shall be removed where this non-range exists within a range, but the range wasn't removed already
       let dropFeature = false
       for (let i = 0; i < ranges.length; i++) {
         const range = ranges[i]
-        if (withinRange(feature, range)) {
+        // if the range wasn't just removed in filter A, and the feature is within the range
+        if (!(hash(range) in rangesRemovedInFilterA) && withinRange(feature, range)) {
           // found within a range, drop feature unless would drop addr:unit information
           if ('addr:unit' in feature.properties) {
             // safe to drop if the same addr:unit is also on the range
@@ -227,7 +255,13 @@ const reduceNonRange = new Transform({
       }
       if (!dropFeature) {
         this.push(feature)
+      } else {
+        if (argv.verbose) {
+          console.log(`Filter B: Dropping ${feature.properties['addr:housenumber']}`)
+        }
       }
+    } else {
+      this.push(feature)
     }
 
     callback()
@@ -257,7 +291,8 @@ pipeline(
       console.log(err)
       process.exit(1)
     } else {
-      // second pass to reduce overlapping features
+      console.log('Second pass to remove range duplicates')
+      // second pass to remove range duplicates
       pipeline(
         fs.createReadStream(inputFile),
         ndjson.parse(),
diff --git a/package.json b/package.json
index 388427e..f13575e 100644
--- a/package.json
+++ b/package.json
@@ -6,7 +6,7 @@
   "author": "Andrew Harvey <andrew@alantgeo.com.au>",
   "license": "MIT",
   "scripts": {
-    "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js test/valueLimits.js"
+    "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js test/valueLimits.js test/reduceRangeDuplicates.js"
   },
   "dependencies": {
     "capital-case": "^1.0.4",
@@ -14,6 +14,7 @@
     "clone-deep": "^4.0.1",
     "flatbush": "^3.3.0",
     "geoflatbush": "^1.0.0",
+    "mktemp": "^1.0.0",
     "ndjson": "^2.0.0",
     "readable-stream": "^3.6.0",
     "tape": "^5.2.2",
diff --git a/test/reduceRangeDuplicates.js b/test/reduceRangeDuplicates.js
new file mode 100644
index 0000000..e180841
--- /dev/null
+++ b/test/reduceRangeDuplicates.js
@@ -0,0 +1,81 @@
+const test = require('tape')
+const fs = require('fs')
+const child_process = require('child_process')
+const mktemp = require('mktemp')
+
+// Test fixture factory: a GeoJSON Feature with null geometry carrying the
+// given address tags; state and postcode are fixed to 'VIC' and '0000'.
+function createFeature(housenumber, street, suburb) {
+  const properties = {
+    'addr:housenumber': housenumber,
+    'addr:street': street,
+    'addr:suburb': suburb,
+    'addr:state': 'VIC',
+    'addr:postcode': '0000'
+  }
+
+  return { type: 'Feature', properties, geometry: null }
+}
+
+test('reduceRangeDuplicates: range with both endpoints present', t => {
+  const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
+  const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
+  const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
+
+  const AB = createFeature('304-306', 'Cardigan Street', 'Carlton')
+  const A = createFeature('304', 'Cardigan Street', 'Carlton')
+  const B = createFeature('306', 'Cardigan Street', 'Carlton')
+
+  try {
+    // all three features to appear in input
+    fs.appendFileSync(inputFile, JSON.stringify(AB) + '\n')
+    fs.appendFileSync(inputFile, JSON.stringify(A) + '\n')
+    fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
+
+    // output expected to just be endpoints, dropping the range
+    fs.appendFileSync(expectedFile, JSON.stringify(A) + '\n')
+    fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n')
+
+    child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+
+    t.same(
+      fs.readFileSync(outputFile),
+      fs.readFileSync(expectedFile),
+      'range with endpoints appearing separately, drops range'
+    )
+  } finally {
+    // execSync throws on a non-zero exit; still remove the temp files in that case
+    [inputFile, outputFile, expectedFile].forEach(file => fs.unlinkSync(file))
+  }
+  t.end()
+})
+
+test('reduceRangeDuplicates: range with lone midpoint', t => {
+  const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
+  const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
+  const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
+
+  const AC = createFeature('249-263', 'Faraday Street', 'Carlton')
+  const B = createFeature('251', 'Faraday Street', 'Carlton')
+
+  try {
+    // both features to appear in input
+    fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n')
+    fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
+
+    // output expected to just be range, dropping the midpoint
+    fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n')
+
+    child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+
+    t.same(
+      fs.readFileSync(outputFile),
+      fs.readFileSync(expectedFile),
+      'range with lone midpoint, drops midpoint'
+    )
+  } finally {
+    // execSync throws on a non-zero exit; still remove the temp files in that case
+    [inputFile, outputFile, expectedFile].forEach(file => fs.unlinkSync(file))
+  }
+  t.end()
+})
diff --git a/yarn.lock b/yarn.lock
index 1b0b2f3..d29620d 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -436,6 +436,11 @@ minimist@^1.2.5:
   resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.5.tgz#67d66014b66a6a8aaa0c083c5fd58df4e4e97602"
   integrity sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw==
 
+mktemp@^1.0.0:
+  version "1.0.0"
+  resolved "https://registry.yarnpkg.com/mktemp/-/mktemp-1.0.0.tgz#b670eff23f52d6529e1dc362cb74ddf85448a9e3"
+  integrity sha512-2duBeS0A75x0M3sCoY0R1TiLsYfIBUtNBNWS++eo+bX/ObVqzblqnEQhlaepoBOLD14wklsV3cYxZ68o5qYO8A==
+
 ndjson@^2.0.0:
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/ndjson/-/ndjson-2.0.0.tgz#320ac86f6fe53f5681897349b86ac6f43bfa3a19"