From 8581d5182b9b0b126b6cf690e40740ebf5aa1c75 Mon Sep 17 00:00:00 2001 From: Andrew Harvey Date: Fri, 7 May 2021 22:00:39 +1000 Subject: reduce numbers within a range duplication --- Makefile | 4 + README.md | 14 +- bin/reduceRangeDuplicates.js | 295 +++++++++++++++++++++++++++++++++++++++++++ lib/withinRange.js | 45 +++++++ package.json | 2 +- test/withinRange.js | 107 ++++++++++++++++ 6 files changed, 465 insertions(+), 2 deletions(-) create mode 100755 bin/reduceRangeDuplicates.js create mode 100644 lib/withinRange.js create mode 100644 test/withinRange.js diff --git a/Makefile b/Makefile index 5ea62e0..6b1198b 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,10 @@ dist/vicmap-osm-uniq-flats.geojson: dist/vicmap-osm-uniq.geojson mkdir -p debug/reduceOverlap node --max_old_space_size=4096 ./bin/reduceOverlap.js --debug $< $@ +dist/vicmap-osm-uniq-flats-withinrange.geojson: dist/vicmap-osm-uniq-flats.geojson + mkdir -p debug/reduceRangeDuplicates + node --max_old_space_size=4096 ./bin/reduceRangeDuplicates.js --debug $< $@ + loadPgOSM: dist/vicmap-osm.geojson ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vm_osm diff --git a/README.md b/README.md index cfbd015..1ce0b18 100644 --- a/README.md +++ b/README.md @@ -28,10 +28,14 @@ Remove duplicates where all address attributes match at the same location or wit Reduce some address points with the same coordinates but different address attributes (see _Overlapping points_ below) (code at `bin/reduceOverlap.js`): - make dist/vicmap-osm-flats.geojson + make dist/vicmap-osm-uniq-flats.geojson This is only done for strictly overlapping points, where the geometry varies slightly then that's okay we don't attempt to combine. +Drop address ranges where the range endpoints are seperatly mapped. 
+ + make dist/vicmap-osm-uniq-flats-withinrange.geojson + ### Omitted addresses Source addresses are omitted where they: @@ -42,6 +46,14 @@ Since these addresses have no identifying attribute beyond street, and there is These rules are defined in `filterOSM.js`. +#### Duplicates through mixed range/individual points + +Some addresses appear as both a range and individual points. For example one address as `1-5` but additional addresses as `1`, `3` and `5`. + +Where the endpoints of the range match existing non-range address points, and where the unit value is the same, and where the individual points have different geometries the range address is dropped in favour of the individual points. + +Where the individual points share the same geometry as each other, then the range is favoured and the individual points are dropped. + ### OSM schema - `addr:unit` is constructed either as a single value or range where the building unit is supplied diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js new file mode 100755 index 0000000..ed85d75 --- /dev/null +++ b/bin/reduceRangeDuplicates.js @@ -0,0 +1,295 @@ +#!/usr/bin/env node + +/** + * Remove duplicates created by addresses from a range also appearing individually + * eg. 
+ *  - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
+ *  - 304 Cardigan Street Carlton
+ *  - 306 Cardigan Street Carlton
+ *
+ *  - 249-263 Faraday Street
+ *  - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
+ *
+ */
+
+const fs = require('fs')
+const { Transform, pipeline } = require('stream')
+const ndjson = require('ndjson')
+// this script lives in bin/, so lib/ has to be reached through the parent directory
+const withinRange = require('../lib/withinRange.js')
+
+const argv = require('yargs/yargs')(process.argv.slice(2))
+  .option('debug', {
+    type: 'boolean',
+    description: 'Dumps full debug logs'
+  })
+  .argv
+
+if (argv._.length < 2) {
+  console.error("Usage: ./reduceRangeDuplicates.js input.geojson output.geojson")
+  process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+  console.error(`${inputFile} not found`)
+  process.exit(1)
+}
+
+// number of features read from the input, also used as the total when reporting progress
+let sourceCount = 0
+
+// every feature whose addr:housenumber contains a hyphen
+const ranges = []
+// non-range features grouped by "street/suburb/state/postcode"
+const nonRangesByStreet = {}
+
+// index all non-range addresses by street, suburb, state, postcode
+// nothing is pushed downstream, this transform only fills `ranges` and `nonRangesByStreet`
+const index = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    sourceCount++
+
+    if (sourceCount % 10000 === 0) {
+      process.stdout.write(` ${sourceCount / 1000}k\r`)
+    }
+
+    const isRange = feature.properties['addr:housenumber'].split('-').length > 1
+
+    if (isRange) {
+      ranges.push(feature)
+    } else {
+      const key = [
+        feature.properties['addr:street'],
+        feature.properties['addr:suburb'],
+        feature.properties['addr:state'],
+        feature.properties['addr:postcode']
+      ].join('/')
+
+      if (!(key in nonRangesByStreet)) {
+        nonRangesByStreet[key] = []
+      }
+      nonRangesByStreet[key].push(feature)
+    }
+
+    callback()
+  }
+})
+
+// splits a housenumber into an optional non-digit prefix, the digits and an optional non-digit suffix
+const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
+
+/*
+* First pass removes ranges where each endpoint of the range exists separately
+* as a non-range address with the same street, suburb, state and postcode
+* eg.
+*  - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
+*  - 304 Cardigan Street Carlton
+*  - 306 Cardigan Street Carlton
+*
+* A range is kept when either endpoint is missing, eg.
+*  - 249-263 Faraday Street - kept when 249 and 263 don't both exist separately
+*  - 251 Faraday Street - passed through by this pass, it is left to the second pass
+*
+* Non-range features are always passed through unchanged.
+* Only addr:housenumber is compared here, addr:unit and the geometry are not looked at.
+*/
+let reduceRangeIndex = 0
+
+// splits a housenumber into an optional non-digit prefix, the digits and an optional non-digit suffix
+// kept local to this pass so it does not rely on how the rest of the file parses housenumbers
+const housenumberParts = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
+
+const reduceRange = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    reduceRangeIndex++
+    if (reduceRangeIndex % 10000 === 0) {
+      process.stdout.write(` ${reduceRangeIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceRangeIndex / sourceCount * 100)}%)\r`)
+    }
+
+    const rangeParts = feature.properties['addr:housenumber'].split('-')
+
+    if (rangeParts.length < 2) {
+      // not a range, we will see if it can be removed in the second pass
+      this.push(feature)
+      callback()
+      return
+    }
+
+    // see if the range can be removed because each end point of the range is included separately
+    const [start, end] = rangeParts
+
+    const key = [
+      feature.properties['addr:street'],
+      feature.properties['addr:suburb'],
+      feature.properties['addr:state'],
+      feature.properties['addr:postcode']
+    ].join('/')
+
+    // housenumbers of the non-range addresses on the same street (empty when there are none)
+    const housenumbersOnStreet = new Set(
+      (nonRangesByStreet[key] || []).map(candidate => candidate.properties['addr:housenumber'])
+    )
+
+    if (!housenumbersOnStreet.has(start) || !housenumbersOnStreet.has(end)) {
+      // since not both start and end found, then still include the range
+      this.push(feature)
+      callback()
+      return
+    }
+
+    // found both start and end, see if any intermediates are missing
+    // this is only reported, the range is removed either way
+    const startMatch = start.match(housenumberParts)
+    const endMatch = end.match(housenumberParts)
+    if (startMatch && endMatch) {
+      const { pre, suf } = startMatch.groups
+      // parse as numbers, otherwise `startNum + 2` would concatenate strings
+      // an endpoint without digits parses to NaN which skips the loop below
+      const startNum = Number.parseInt(startMatch.groups.num, 10)
+      const endNum = Number.parseInt(endMatch.groups.num, 10)
+
+      // steps by 2, presumably because a range runs along one side of the street
+      // (all odd or all even numbers) - TODO confirm against the source data
+      let foundAllIntermediates = true
+      for (let i = startNum + 2; i <= endNum - 2 && foundAllIntermediates; i += 2) {
+        foundAllIntermediates = housenumbersOnStreet.has(`${pre}${i}${suf}`)
+      }
+
+      if (!foundAllIntermediates) {
+        // some intermediates were missing
+        // but we'll pretend that's okay and let the geocoding algorithm use its own interpolation to still find results
+        console.log('found endpoints but some intermediates are missing', feature)
+      }
+    }
+
+    // can be removed, feature not pushed
+    callback()
+  }
+})
+
+/*
+* Second pass removes any non-range elements where the range exists, and wasn't removed from the first pass
+* eg.
+*  - 249-263 Faraday Street
+*  - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
+*
+* A non-range carrying an addr:unit is only removed when the range carries the same addr:unit,
+* otherwise the unit information would be lost.
+* Ranges reaching this pass are passed through unchanged.
+*/
+let reduceNonRangeIndex = 0
+
+// street, suburb, state, postcode key, the same grouping used for nonRangesByStreet
+const streetKeyOf = feature => [
+  feature.properties['addr:street'],
+  feature.properties['addr:suburb'],
+  feature.properties['addr:state'],
+  feature.properties['addr:postcode']
+].join('/')
+
+// true when both endpoints of the range exist as non-range addresses on the same street,
+// which is the condition the first pass uses to remove the range
+const removedByFirstPass = range => {
+  const [start, end] = range.properties['addr:housenumber'].split('-')
+  const housenumbers = (nonRangesByStreet[streetKeyOf(range)] || [])
+    .map(candidate => candidate.properties['addr:housenumber'])
+  return housenumbers.includes(start) && housenumbers.includes(end)
+}
+
+// ranges which survive the first pass grouped by street key
+// built on first use, by then the indexing pipeline has completed so `ranges` and `nonRangesByStreet` are fully populated
+let survivingRangesByStreet = null
+const getSurvivingRangesByStreet = () => {
+  if (survivingRangesByStreet === null) {
+    survivingRangesByStreet = new Map()
+    ranges.forEach(range => {
+      if (removedByFirstPass(range)) {
+        return
+      }
+      const key = streetKeyOf(range)
+      if (!survivingRangesByStreet.has(key)) {
+        survivingRangesByStreet.set(key, [])
+      }
+      survivingRangesByStreet.get(key).push(range)
+    })
+  }
+  return survivingRangesByStreet
+}
+
+const reduceNonRange = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    reduceNonRangeIndex++
+    if (reduceNonRangeIndex % 10000 === 0) {
+      process.stdout.write(` ${reduceNonRangeIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceNonRangeIndex / sourceCount * 100)}%)\r`)
+    }
+
+    const isRange = feature.properties['addr:housenumber'].split('-').length > 1
+
+    if (isRange) {
+      // a range which got this far is kept, without this the output would contain no ranges at all
+      this.push(feature)
+      callback()
+      return
+    }
+
+    // not a range, shall be removed when this non-range exists within a range, but the range wasn't removed already
+    // withinRange needs street, suburb, state and postcode to match, so only ranges under the same key can match
+    const candidateRanges = getSurvivingRangesByStreet().get(streetKeyOf(feature)) || []
+    // only the first matching range decides
+    const range = candidateRanges.find(candidate => withinRange(feature, candidate))
+
+    let dropFeature = false
+    if (range) {
+      // found within a range, drop feature unless would drop addr:unit information
+      if (!('addr:unit' in feature.properties)) {
+        // no addr:unit on the feature so safe to drop
+        dropFeature = true
+      } else if ('addr:unit' in range.properties &&
+        feature.properties['addr:unit'] === range.properties['addr:unit']) {
+        // safe to drop since the same addr:unit is also on the range
+        dropFeature = true
+      } else {
+        // since the non-range feature has a unit that the range doesn't have, don't drop it
+        // the debug stream only exists when running with --debug
+        const debugStream = debugStreams['addrInRangeDifferentUnits']
+        if (debugStream) {
+          debugStream.write(feature)
+          debugStream.write(range)
+        }
+      }
+    }
+
+    if (!dropFeature) {
+      this.push(feature)
+    }
+
+    callback()
+  }
+})
+
+// debug features are written as ndjson, one output file per debug key
+const debugKeys = ['addrInRangeDifferentUnits']
+const debugStreams = {}
+const debugStreamOutputs = {}
+
+// the streams are only opened when running with --debug, otherwise both lookups stay empty
+if (argv.debug) {
+  for (const debugKey of debugKeys) {
+    const serializer = ndjson.stringify()
+    debugStreams[debugKey] = serializer
+
+    const sink = fs.createWriteStream(`debug/reduceRangeDuplicates/${debugKey}.geojson`)
+    debugStreamOutputs[debugKey] = serializer.pipe(sink)
+  }
+}
+
+// first read of the input indexes the non-range addresses by street and collects the ranges
+console.log('First pass to index non-ranges by street,suburb,state,postcode properties')
+pipeline(
+  fs.createReadStream(inputFile),
+  ndjson.parse(),
+  index,
+  err => {
+    if (err) {
+      console.error(err)
+      process.exit(1)
+    } else {
+      // second read of the input removes the duplicated ranges and non-ranges and writes the output
+      pipeline(
+        fs.createReadStream(inputFile),
+        // the reduce transforms work on parsed features, not on the raw bytes of the file
+        ndjson.parse(),
+        reduceRange,
+        reduceNonRange,
+        ndjson.stringify(),
+        fs.createWriteStream(outputFile),
+        reduceErr => {
+          if (reduceErr) {
+            console.error(reduceErr)
+            process.exit(1)
+          } else {
+            if (argv.debug) {
+              // close the debug streams, then wait for each file to be flushed before exiting
+              debugKeys.forEach(key => {
+                debugStreams[key].end()
+              })
+
+              Promise.all(debugKeys.map(key => {
+                return new Promise(resolve => {
+                  debugStreamOutputs[key].on('finish', () => {
+                    console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`)
+                    resolve()
+                  })
+                })
+              }))
+                .then(() => {
+                  process.exit(0)
+                })
+            } else {
+              process.exit(0)
+            }
+          }
+        }
+      )
+    }
+  }
+)
diff --git a/lib/withinRange.js b/lib/withinRange.js
new file mode 100644
index 0000000..e75f788
--- /dev/null
+++ b/lib/withinRange.js
@@ -0,0 +1,45 @@
+/**
+ * Test if a single address falls within an address range on the same street.
+ *
+ * Both features need an addr:housenumber and an addr:street, and their addr:street,
+ * addr:suburb, addr:state and addr:postcode must be strictly equal (a value missing on
+ * both sides counts as equal). Other attributes such as addr:unit are not compared.
+ *
+ * Only the numeric part of each housenumber is compared, any prefix or suffix is ignored.
+ * Every number from the start to the end of the range counts, odd or even.
+ *
+ * @param {Object} feature GeoJSON feature with a single addr:housenumber
+ * @param {Object} rangeFeature GeoJSON feature with an addr:housenumber range like "1-5"
+ *
+ * @returns {boolean} True if the number of addr:housenumber of feature is within the range of
+ *   addr:housenumber of rangeFeature and the street, suburb, state and postcode match.
+ *   False when rangeFeature is not a two part range or when any of the numbers can't be parsed.
+ */
+module.exports = (feature, rangeFeature) => {
+  // optional non-digit prefix, the digits, optional non-digit suffix
+  const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
+
+  const props = feature.properties
+  const rangeProps = rangeFeature.properties
+
+  const comparable =
+    // must have a housenumber
+    'addr:housenumber' in props &&
+    'addr:housenumber' in rangeProps &&
+
+    // must have a street and street must match
+    'addr:street' in props &&
+    'addr:street' in rangeProps &&
+    props['addr:street'] === rangeProps['addr:street'] &&
+
+    // other higher attributes must match if exists
+    props['addr:suburb'] === rangeProps['addr:suburb'] &&
+    props['addr:state'] === rangeProps['addr:state'] &&
+    props['addr:postcode'] === rangeProps['addr:postcode']
+
+  if (!comparable) {
+    return false
+  }
+
+  const rangeParts = String(rangeProps['addr:housenumber']).split('-')
+  if (rangeParts.length !== 2) {
+    // range is not actually a range
+    return false
+  }
+
+  // numeric part of a housenumber, NaN when it has no digits or doesn't fit the pattern
+  const numberOf = housenumber => {
+    const match = String(housenumber).match(regexp)
+    return match ? Number.parseInt(match.groups.num, 10) : NaN
+  }
+
+  const from = numberOf(rangeParts[0])
+  const to = numberOf(rangeParts[1])
+  const num = numberOf(props['addr:housenumber'])
+
+  // compare as numbers, comparing the strings would put '10' before '9'
+  // any NaN makes both comparisons false
+  return num >= from && num <= to
+}
diff --git a/package.json b/package.json
index a3a45f0..825e344 100644
--- a/package.json
+++ b/package.json
@@ -6,7 +6,7 @@
   "author": "Andrew Harvey ",
   "license": "MIT",
   "scripts": {
-    "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js"
+    "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js"
   },
   "dependencies": {
     "capital-case": "^1.0.4",
diff --git a/test/withinRange.js b/test/withinRange.js
new file mode 100644
index 0000000..1158c20
--- /dev/null
+++ b/test/withinRange.js
@@ -0,0 +1,107 @@
+const test = require('tape')
+
+const withinRange = require('../lib/withinRange.js')
+
+// builds a point feature at the origin which only carries a housenumber and a street
+const address = (housenumber, street) => ({
+  type: 'Feature',
+  properties: {
+    'addr:housenumber': housenumber,
+    'addr:street': street
+  },
+  geometry: {
+    type: 'Point',
+    coordinates: [0, 0]
+  }
+})
+
+// single addresses on Main Street
+const A = address('1', 'Main Street')
+const B = address('2', 'Main Street')
+const C = address('3', 'Main Street')
+
+// ranges on Main Street
+const AB = address('1-2', 'Main Street')
+const AC = address('1-3', 'Main Street')
+
+// same range as AC but on another street
+const AC_2 = address('1-3', 'Second Street')
+
+test('withinRange', t => {
+  // [feature, range, expected result, assertion message]
+  const cases = [
+    [A, AB, true, 'A within AB'],
+    [A, AC, true, 'A within AC'],
+    [B, AB, true, 'B within AB'],
+    [B, AC, true, 'B within AC'],
+    [C, AB, false, 'C not within AB'],
+    [A, AC_2, false, 'A Main Street not within AC Secondary Street']
+  ]
+
+  cases.forEach(([feature, range, expected, message]) => {
+    t.same(withinRange(feature, range), expected, message)
+  })
+
+  t.end()
+})
-- 
cgit v1.2.3