From eb4b63480a165f7cca44a7f067e5de613340440e Mon Sep 17 00:00:00 2001 From: Andrew Harvey Date: Wed, 30 Jun 2021 22:17:45 +1000 Subject: fixes to reduceRangeDuplicates, require two passes for A/B rather than a single pass --- .vscode/launch.json | 12 ++- bin/reduceRangeDuplicates.js | 113 +++++++++++++++------ .../reduceRangeDuplicates/expectedOutput1.geojson | 2 + test/fixtures/reduceRangeDuplicates/input1.geojson | 3 + test/reduceRangeDuplicates.js | 85 +++++++++++----- 5 files changed, 156 insertions(+), 59 deletions(-) create mode 100644 test/fixtures/reduceRangeDuplicates/expectedOutput1.geojson create mode 100644 test/fixtures/reduceRangeDuplicates/input1.geojson diff --git a/.vscode/launch.json b/.vscode/launch.json index 707d6d2..291dcce 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -61,8 +61,18 @@ "skipFiles": [ "/**" ], + "program": "${workspaceFolder}/node_modules/.bin/tape", + "args": ["test/reduceRangeDuplicates.js"] + }, + { + "type": "pwa-node", + "request": "launch", + "name": "bin/reduceRangeDuplicates fixture1", + "skipFiles": [ + "/**" + ], "program": "${workspaceFolder}/bin/reduceRangeDuplicates.js", - "args": ["/tmp/input_POZXt.geojson", "/tmp/output/0OIov.geojson"] + "args": ["test/fixtures/reduceRangeDuplicates/input1.geojson", "test/fixtures/reduceRangeDuplicates/actualOutput1.geojson"] }, { "type": "pwa-node", diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js index 6fac0b5..4ef57f9 100755 --- a/bin/reduceRangeDuplicates.js +++ b/bin/reduceRangeDuplicates.js @@ -41,6 +41,8 @@ if (!fs.existsSync(inputFile)) { process.exit(1) } +const intermediateFile = `${outputFile}-intermediate.json` + function hash(feature) { return [ feature.properties['addr:housenumber'], @@ -95,11 +97,13 @@ const index = new Transform({ const regexp = /^(?
\D*)(?\d*)(?\D*)$/
 
 /*
-* second pass, filter A removes ranges where each endpoint of the range exists separately
+* First pass, filter A removes ranges where each endpoint of the range exists separately
 * eg.
 *  - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
 *  - 304 Cardigan Street Calton
 *  - 306 Cardigan Street Calton
+*
+*  Conditional on the individual addresses not sharing the same geometry, if they do then they are dropped in favour of the range
 * 
 *  - 249-263 Faraday Street
 *  - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
@@ -200,13 +204,25 @@ const reduceRange = new Transform({
             }
           }
 
-          // can be removed, feature not pushed
-          if (argv.verbose) {
-            console.log(`Filter A: ${feature.properties['addr:housenumber']} can be removed`)
-          }
+          // if matched start and end point have the same coordinates, then to avoid overlapping points, favour range so retain it
+          if (matchedStart.geometry.coordinates.join(',') === (matchedEnd.geometry.coordinates.join(','))) {
+            if (argv.verbose) {
+              console.log(`Filter A: ${feature.properties['addr:housenumber']} ${feature.properties['addr:street']} ${feature.properties['addr:suburb']} retained because while endpoints exist they share the same geometry`)
+            }
+            this.push(feature)
+          } else {
+            // can be removed, feature not pushed
+            if (argv.verbose) {
+              console.log(`Filter A: ${feature.properties['addr:housenumber']} ${feature.properties['addr:street']} ${feature.properties['addr:suburb']} can be removed`)
+            }
+
+            // keep track of removed features for filter B, so we don't double remove both range and midpoints
+            rangesRemovedInFilterA[hash(feature)] = true
 
-          // keep track of removed features for filter B, so we don't double remove both range and midpoints
-          rangesRemovedInFilterA[hash(feature)] = true
+            if (argv.debug) {
+              debugStreams['filterA_dropRange'].write(feature)
+            }
+          }
         } else {
           // not both start and end found,
           // if one of start or end found and that start/end has addr:flats...
@@ -216,6 +232,9 @@ const reduceRange = new Transform({
               (matchedStart && matchedStart.properties['addr:flats']) || (matchedEnd && matchedEnd.properties['addr:flats'])
             )) {
               // drop the range, eg "112-116 Anderson Street, South Yarra"
+              if (argv.debug) {
+                debugStreams['filterA_dropRangeRangeNoFlatsNonRangeHasFlats'].write(feature)
+              }
             } else {
               // then still include the range
               this.push(feature)
@@ -285,8 +304,17 @@ const reduceNonRange = new Transform({
                   // since the non-range feature has a unit that the range doesn't have, don't drop it
                   dropFeature = false
                   if (argv.debug) {
-                    debugStreams['addrInRangeDifferentUnits'].write(feature)
-                    debugStreams['addrInRangeDifferentUnits'].write(range)
+                    debugStreams.addrInRangeDifferentUnits.write(feature)
+                    debugStreams.addrInRangeDifferentUnits.write(range)
+
+                    debugStreams.addrInRangeDifferentUnits.write({
+                      type: 'Feature',
+                      properties: feature.properties,
+                      geometry: {
+                        type: 'LineString',
+                        coordinates: [feature.geometry.coordinates, range.geometry.coordinates]
+                      }
+                    })
                   }
                 }
             } else {
@@ -297,12 +325,16 @@ const reduceNonRange = new Transform({
           }
         }
       }
+
       if (!dropFeature) {
         this.push(feature)
       } else {
         if (argv.verbose) {
           console.log(`Filter B: Dropping ${feature.properties['addr:housenumber']}`)
         }
+        if (argv.debug) {
+          debugStreams['filterB'].write(feature)
+        }
       }
     } else {
       this.push(feature)
@@ -313,7 +345,7 @@ const reduceNonRange = new Transform({
 })
 
 // ndjson streams to output debug features
-const debugKeys = ['addrInRangeDifferentUnits']
+const debugKeys = ['addrInRangeDifferentUnits', 'filterA_dropRangeRangeNoFlatsNonRangeHasFlats', 'filterA_dropRange', 'filterB']
 const debugStreams = {}
 const debugStreamOutputs = {}
 
@@ -335,39 +367,54 @@ pipeline(
       console.log(err)
       process.exit(1)
     } else {
-      // second pass to remove range duplicates
-      console.log('Pass 2/2: remove range duplicates')
+      // second pass to remove range duplicates part A
+      console.log('Pass 2/3: remove range duplicates part A ranges')
       pipeline(
         fs.createReadStream(inputFile),
         ndjson.parse(),
         reduceRange,
-        reduceNonRange,
         ndjson.stringify(),
-        fs.createWriteStream(outputFile),
+        fs.createWriteStream(intermediateFile),
         err => {
           if (err) {
             console.log(err)
             process.exit(1)
           } else {
-            if (argv.debug) {
-              debugKeys.forEach(key => {
-                debugStreams[key].end()
-              })
-
-              Promise.all(debugKeys.map(key => {
-                return new Promise(resolve => {
-                  debugStreamOutputs[key].on('finish', () => {
-                    console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`)
-                    resolve()
-                  })
-                })
-              }))
-                .then(() => {
-                  process.exit(0)
-                })
-            } else {
-              process.exit(0)
-            }
+            console.log('Pass 3/3: remove range duplicates part B endpoints')
+            pipeline(
+              fs.createReadStream(intermediateFile),
+              ndjson.parse(),
+              reduceNonRange,
+              ndjson.stringify(),
+              fs.createWriteStream(outputFile),
+              err => {
+                fs.unlinkSync(intermediateFile)
+                if (err) {
+                  console.log(err)
+                  process.exit(1)
+                } else {
+                  if (argv.debug) {
+                    debugKeys.forEach(key => {
+                      debugStreams[key].end()
+                    })
+
+                    Promise.all(debugKeys.map(key => {
+                      return new Promise(resolve => {
+                        debugStreamOutputs[key].on('finish', () => {
+                          console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`)
+                          resolve()
+                        })
+                      })
+                    }))
+                      .then(() => {
+                        process.exit(0)
+                      })
+                  } else {
+                    process.exit(0)
+                  }
+                }
+              }
+            )
           }
         }
       )
diff --git a/test/fixtures/reduceRangeDuplicates/expectedOutput1.geojson b/test/fixtures/reduceRangeDuplicates/expectedOutput1.geojson
new file mode 100644
index 0000000..1e8f2c7
--- /dev/null
+++ b/test/fixtures/reduceRangeDuplicates/expectedOutput1.geojson
@@ -0,0 +1,2 @@
+{"type":"Feature","properties":{"_pfi":"50586965,50585085,425955908","addr:housenumber":"39","addr:street":"Linton Street","addr:suburb":"Balaclava","addr:state":"VIC","addr:postcode":"3183","addr:unit":null,"addr:flats":null,"addr:flats2":null,"addr:flats3":null,"addr:flats4":null,"addr:flats5":null,"addr:flats6":null,"addr:flats7":null},"geometry":{"type":"Point","coordinates":[144.9924932,-37.8669844]}}
+{"type":"Feature","properties":{"_pfi":"50585086,425955907","addr:housenumber":"41","addr:street":"Linton Street","addr:suburb":"Balaclava","addr:state":"VIC","addr:postcode":"3183","addr:unit":null,"addr:flats":null,"addr:flats2":null,"addr:flats3":null,"addr:flats4":null,"addr:flats5":null,"addr:flats6":null,"addr:flats7":null},"geometry":{"type":"Point","coordinates":[144.992463,-37.8670328]}}
diff --git a/test/fixtures/reduceRangeDuplicates/input1.geojson b/test/fixtures/reduceRangeDuplicates/input1.geojson
new file mode 100644
index 0000000..d91155d
--- /dev/null
+++ b/test/fixtures/reduceRangeDuplicates/input1.geojson
@@ -0,0 +1,3 @@
+{ "type": "Feature", "properties": { "_pfi": "50586965,50585085,425955908", "addr:housenumber": "39", "addr:street": "Linton Street", "addr:suburb": "Balaclava", "addr:state": "VIC", "addr:postcode": "3183", "addr:unit": null, "addr:flats": null, "addr:flats2": null, "addr:flats3": null, "addr:flats4": null, "addr:flats5": null, "addr:flats6": null, "addr:flats7": null }, "geometry": { "type": "Point", "coordinates": [ 144.9924932, -37.8669844 ] } }
+{ "type": "Feature", "properties": { "_pfi": "206441264", "addr:housenumber": "39-41", "addr:street": "Linton Street", "addr:suburb": "Balaclava", "addr:state": "VIC", "addr:postcode": "3183", "addr:unit": null, "addr:flats": null, "addr:flats2": null, "addr:flats3": null, "addr:flats4": null, "addr:flats5": null, "addr:flats6": null, "addr:flats7": null }, "geometry": { "type": "Point", "coordinates": [ 144.9925274, -37.8670149 ] } }
+{ "type": "Feature", "properties": { "_pfi": "50585086,425955907", "addr:housenumber": "41", "addr:street": "Linton Street", "addr:suburb": "Balaclava", "addr:state": "VIC", "addr:postcode": "3183", "addr:unit": null, "addr:flats": null, "addr:flats2": null, "addr:flats3": null, "addr:flats4": null, "addr:flats5": null, "addr:flats6": null, "addr:flats7": null }, "geometry": { "type": "Point", "coordinates": [ 144.992463, -37.8670328 ] } }
diff --git a/test/reduceRangeDuplicates.js b/test/reduceRangeDuplicates.js
index 091390c..21c80c4 100644
--- a/test/reduceRangeDuplicates.js
+++ b/test/reduceRangeDuplicates.js
@@ -3,7 +3,7 @@ const fs = require('fs')
 const child_process = require('child_process')
 const mktemp = require('mktemp')
 
-function createFeature(unit, housenumber, street, suburb, flats) {
+function createFeature(coordinates, unit, housenumber, street, suburb, flats) {
   return {
     type: 'Feature',
     properties: {
@@ -15,18 +15,21 @@ function createFeature(unit, housenumber, street, suburb, flats) {
       'addr:state': 'VIC',
       'addr:postcode': '0000'
     },
-    geometry: null
+    geometry: coordinates ? {
+      type: 'Point',
+      coordinates: coordinates
+    } : null
   }
 }
 
-test('reduceRangeDuplicates', t => {
+test('reduceRangeDuplicates distinct geometries', t => {
   const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
   const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
   const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
 
-  const AB = createFeature(null, '304-306', 'Cardigan Street', 'Carlton')
-  const A = createFeature(null, '304', 'Cardigan Street', 'Carlton')
-  const B = createFeature(null, '306', 'Cardigan Street', 'Carlton')
+  const AB = createFeature([0, 0], null, '304-306', 'Cardigan Street', 'Carlton')
+  const A = createFeature([-1, 0], null, '304', 'Cardigan Street', 'Carlton')
+  const B = createFeature([1, 0], null, '306', 'Cardigan Street', 'Carlton')
 
   // all three features to appear in input
   fs.appendFileSync(inputFile, JSON.stringify(AB) + '\n')
@@ -37,11 +40,11 @@ test('reduceRangeDuplicates', t => {
   fs.appendFileSync(expectedFile, JSON.stringify(A) + '\n')
   fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n')
 
-  child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
 
   t.same(
-    fs.readFileSync(outputFile),
-    fs.readFileSync(expectedFile),
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
     'range with endpoints appearing separately, drops range'
   )
 
@@ -52,13 +55,45 @@ test('reduceRangeDuplicates', t => {
   t.end()
 })
 
+test('reduceRangeDuplicates overlapping geometries', t => {
+  const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
+  const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
+  const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
+
+  const AB = createFeature([0, 0], null, '304-306', 'Cardigan Street', 'Carlton')
+  const A = createFeature([0, 0], null, '304', 'Cardigan Street', 'Carlton')
+  const B = createFeature([0, 0], null, '306', 'Cardigan Street', 'Carlton')
+
+  // all three features to appear in input
+  fs.appendFileSync(inputFile, JSON.stringify(AB) + '\n')
+  fs.appendFileSync(inputFile, JSON.stringify(A) + '\n')
+  fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
+
+  // output expected to drop the endpoints and retain the range since endpoints are overlapping
+  fs.appendFileSync(expectedFile, JSON.stringify(AB) + '\n')
+
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
+
+  t.same(
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    'range with endpoints appearing separately but overlapping, drops the endpoints'
+  )
+
+  fs.unlinkSync(inputFile)
+  fs.unlinkSync(outputFile)
+  fs.unlinkSync(expectedFile)
+
+  t.end()
+})
+
 test('reduceRangeDuplicates', t => {
   const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
   const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
   const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
 
-  const AC = createFeature(null, '249-263', 'Faraday Street', 'Carlton')
-  const B = createFeature(null, '251', 'Faraday Street', 'Carlton')
+  const AC = createFeature(null, null, '249-263', 'Faraday Street', 'Carlton')
+  const B = createFeature(null, null, '251', 'Faraday Street', 'Carlton')
 
   // both features to appear in input
   fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n')
@@ -67,11 +102,11 @@ test('reduceRangeDuplicates', t => {
   // output expected to just be range, dropping the midpoint
   fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n')
 
-  child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
 
   t.same(
-    fs.readFileSync(outputFile),
-    fs.readFileSync(expectedFile),
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
     'range with lone midpoint, drops midpoint'
   )
 
@@ -87,8 +122,8 @@ test('reduceRangeDuplicates', t => {
   const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
   const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
 
-  const AC = createFeature(null, '249-263', 'Faraday Street', 'Carlton')
-  const B = createFeature('1', '251', 'Faraday Street', 'Carlton')
+  const AC = createFeature(null, null, '249-263', 'Faraday Street', 'Carlton')
+  const B = createFeature(null, '1', '251', 'Faraday Street', 'Carlton')
 
   // both features to appear in input
   fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n')
@@ -98,11 +133,11 @@ test('reduceRangeDuplicates', t => {
   fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n')
   fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n')
 
-  child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
 
   t.same(
-    fs.readFileSync(outputFile),
-    fs.readFileSync(expectedFile),
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
     'midpoint with unit not dropped'
   )
 
@@ -118,22 +153,22 @@ test('reduceRangeDuplicates', t => {
   const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
   const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
 
-  const AC = createFeature(null, '249-263', 'Faraday Street', 'Carlton')
-  const B = createFeature(null, '251', 'Faraday Street', 'Carlton', '1;2;3')
+  const AC = createFeature(null, null, '249-263', 'Faraday Street', 'Carlton')
+  const B = createFeature(null, null, '251', 'Faraday Street', 'Carlton', '1;2;3')
 
   // both features to appear in input
   fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n')
   fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
 
-  // output expected to both features because dropping the midpoint would loose the unit
+  // output expected to both features because dropping the midpoint would loose the flats
   fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n')
   fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n')
 
-  child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
 
   t.same(
-    fs.readFileSync(outputFile),
-    fs.readFileSync(expectedFile),
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
     'midpoint with flats not dropped'
   )
 
-- 
cgit v1.2.3