aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile4
-rw-r--r--README.md14
-rwxr-xr-xbin/reduceRangeDuplicates.js295
-rw-r--r--lib/withinRange.js45
-rw-r--r--package.json2
-rw-r--r--test/withinRange.js107
6 files changed, 465 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index 5ea62e0..6b1198b 100644
--- a/Makefile
+++ b/Makefile
@@ -48,6 +48,10 @@ dist/vicmap-osm-uniq-flats.geojson: dist/vicmap-osm-uniq.geojson
mkdir -p debug/reduceOverlap
node --max_old_space_size=4096 ./bin/reduceOverlap.js --debug $< $@
+# remove duplicates created by addresses from a range also appearing individually (bin/reduceRangeDuplicates.js)
+dist/vicmap-osm-uniq-flats-withinrange.geojson: dist/vicmap-osm-uniq-flats.geojson
+ mkdir -p debug/reduceRangeDuplicates
+ node --max_old_space_size=4096 ./bin/reduceRangeDuplicates.js --debug $< $@
+
loadPgOSM: dist/vicmap-osm.geojson
ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vm_osm
diff --git a/README.md b/README.md
index cfbd015..1ce0b18 100644
--- a/README.md
+++ b/README.md
@@ -28,10 +28,14 @@ Remove duplicates where all address attributes match at the same location or wit
Reduce some address points with the same coordinates but different address attributes (see _Overlapping points_ below) (code at `bin/reduceOverlap.js`):
- make dist/vicmap-osm-flats.geojson
+ make dist/vicmap-osm-uniq-flats.geojson
This is only done for strictly overlapping points, where the geometry varies slightly then that's okay we don't attempt to combine.
+Drop address ranges where the range endpoints are separately mapped.
+
+ make dist/vicmap-osm-uniq-flats-withinrange.geojson
+
### Omitted addresses
Source addresses are omitted where they:
@@ -42,6 +46,14 @@ Since these addresses have no identifying attribute beyond street, and there is
These rules are defined in `filterOSM.js`.
+#### Duplicates through mixed range/individual points
+
+Some addresses appear as both a range and individual points. For example one address as `1-5` but additional addresses as `1`, `3` and `5`.
+
+Where the endpoints of the range match existing non-range address points, and where the unit value is the same, and where the individual points have different geometries, the range address is dropped in favour of the individual points.
+
+Where the individual points share the same geometry as each other, then the range is favoured and the individual points are dropped.
+
### OSM schema
- `addr:unit` is constructed either as a single value or range where the building unit is supplied
diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js
new file mode 100755
index 0000000..ed85d75
--- /dev/null
+++ b/bin/reduceRangeDuplicates.js
@@ -0,0 +1,295 @@
+#!/usr/bin/env node
+
+/**
+ * Remove duplicates created by addresses from a range also appearing individually
+ * eg.
+ * - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
+ * - 304 Cardigan Street Carlton
+ * - 306 Cardigan Street Carlton
+ *
+ * - 249-263 Faraday Street
+ * - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
+ *
+ */
+
+const fs = require('fs')
+const { Transform, pipeline } = require('stream')
+const ndjson = require('ndjson')
+const withinRange = require('./lib/withinRange.js')
+
+const argv = require('yargs/yargs')(process.argv.slice(2))
+ .option('debug', {
+ type: 'boolean',
+ description: 'Dumps full debug logs'
+ })
+ .argv
+
+if (argv._.length < 2) {
+ console.error("Usage: ./reduceRangeDuplicates.js input.geojson output.geojson")
+ process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+ console.error(`${inputFile} not found`)
+ process.exit(1)
+}
+
+let sourceCount = 0
+
+const ranges = []
+const nonRangesByStreet = {}
+
+// index all non-range addresses by street, suburb, state, postcode
+const index = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(feature, encoding, callback) {
+ sourceCount++
+
+ if (sourceCount % 10000 === 0) {
+ process.stdout.write(` ${sourceCount / 1000}k\r`)
+ }
+
+ const isRange = feature.properties['addr:housenumber'].split('-').length > 1
+
+ if (isRange) {
+ ranges.push(feature)
+ } else {
+ const key = [
+ feature.properties['addr:street'],
+ feature.properties['addr:suburb'],
+ feature.properties['addr:state'],
+ feature.properties['addr:postcode']
+ ].join('/')
+
+ if (!(key in nonRangesByStreet)) {
+ nonRangesByStreet[key] = []
+ }
+ nonRangesByStreet[key].push(feature)
+ }
+
+ callback()
+ }
+})
+
+const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
+
+/*
+* First pass removes ranges where each endpoint of the range exists seperatly
+* eg.
+* - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
+* - 304 Cardigan Street Calton
+* - 306 Cardigan Street Calton
+*
+* - 249-263 Faraday Street
+* - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
+*/
+let reduceRangeIndex = 0
+const reduceRange = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(feature, encoding, callback) {
+ reduceRangeIndex++
+ if (reduceRangeIndex % 10000 === 0) {
+ process.stdout.write(` ${reduceRangeIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceRangeIndex / sourceCount * 100)}%)\r`)
+ }
+
+ const isRange = feature.properties['addr:housenumber'].split('-').length > 1
+
+ if (isRange) {
+ // see if it can be removed when each end point of the range is included seperatly
+ const start = feature.properties['addr:housenumber'].split('-')[0]
+ const end = feature.properties['addr:housenumber'].split('-')[1]
+
+ const key = [
+ feature.properties['addr:street'],
+ feature.properties['addr:suburb'],
+ feature.properties['addr:state'],
+ feature.properties['addr:postcode']
+ ].join('/')
+
+ // find nonRange addresses on the same street
+ if (key in nonRangesByStreet) {
+ const matchCandidates = nonRangesByStreet[key]
+
+ let foundStart = false
+ let foundEnd = false
+
+ let startNum
+ let endNum
+ let pre = ''
+ let suf = ''
+
+ matchCandidates.map(matchCandidate => {
+ if (start === matchCandidate.properties['addr:housenumber']) {
+ foundStart = true
+
+ const match = start.match(regexp)
+ startNum = match.groups.num
+ pre = match.groups.pre
+ suf = match.groups.suf
+ }
+ if (end === matchCandidate.properties['addr:housenumber']) {
+ foundEnd = true
+
+ const match = end.match(regexp)
+ endNum = match.groups.num
+ }
+ })
+
+ if (foundStart && foundEnd) {
+ // found both start and end
+
+ // see if any intermediates are missing
+ const foundAllIntermediates = true
+ for (let i = (startNum + 2); i <= (endNum - 2) && foundAllIntermediates === true; i += 2) {
+ let foundIntermediate = false
+ matchCandidates.map(matchCandidate => {
+ if (`${pre}${i}${suf}` === matchCandidate.properties['addr:housenumber']) {
+ foundIntermediate = true
+ }
+ })
+
+ if (foundIntermediate === false) {
+ foundAllIntermediates = false
+ }
+ }
+ if (!foundAllIntermediates) {
+ // some intermediates were missing
+ // but we'll pretend that's okay and let the geocoding algorithm use it's own interpolation to still find results
+ console.log('found endpoints but some intermediates are missing', feature)
+ }
+
+ // can be removed, feature not pushed
+ } else {
+ // since not both start and end found, then still include the range
+ this.push(feature)
+ }
+ } else {
+ // there are no non-ranges on this street so still include the range
+ this.push(feature)
+ }
+ } else {
+ // else, not a range, we will see if it can be removed in a second pass
+ // shall be removed removed when this non-range exists within a range, but the range wasn't removed already
+ this.push(feature)
+ }
+
+ callback()
+ }
+})
+
+/*
+* Second pass removes ane non-range elements where the range exists, and wasn't removed from the first pass
+* eg.
+* - 249-263 Faraday Street
+* - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
+*/
+let reduceNonRangeIndex = 0
+const reduceNonRange = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(feature, encoding, callback) {
+ reduceNonRangeIndex++
+ if (reduceNonRangeIndex % 10000 === 0) {
+ process.stdout.write(` ${reduceNonRangeIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceNonRangeIndex / sourceCount * 100)}%)\r`)
+ }
+
+ const isRange = feature.properties['addr:housenumber'].split('-').length > 1
+
+ if (!isRange) {
+ // not a range, ahall be removed removed when this non-range exists within a range, but the range wasn't removed already
+ let dropFeature = false
+ ranges.forEach(range => {
+ if (withinRange(feature, range)) {
+ // found within a range, drop feature unless would drop addr:unit information
+ if ('addr:unit' in feature.properties) {
+ // safe to drop if the same addr:unit is also on the range
+ if ('addr:unit' in range.properties &&
+ feature.properties['addr:unit'] === range.properties['addr:unit']) {
+ dropFeature = true
+ } else {
+ // since the non-range feature has a unit that the range doesn't have, don't drop it
+ dropFeature = false
+ debugStreams['addrInRangeDifferentUnits'].write(feature)
+ debugStreams['addrInRangeDifferentUnits'].write(range)
+ }
+ } else {
+ // no addr:unit on the feature to safe to drop
+ dropFeature = true
+ }
+ break
+ }
+ })
+ if (!dropFeature) {
+ this.push(feature)
+ }
+ }
+
+ callback()
+ }
+})
+
+// ndjson streams to output debug features
+const debugKeys = ['addrInRangeDifferentUnits']
+const debugStreams = {}
+const debugStreamOutputs = {}
+
+if (argv.debug) {
+ debugKeys.forEach(key => {
+ debugStreams[key] = ndjson.stringify()
+ debugStreamOutputs[key] = debugStreams[key].pipe(fs.createWriteStream(`debug/reduceRangeDuplicates/${key}.geojson`))
+ })
+}
+
+// first pass to index by geometry
+console.log('First pass to index non-ranges by street,suburb,state,postcode properties')
+pipeline(
+ fs.createReadStream(inputFile),
+ ndjson.parse(),
+ index,
+ err => {
+ if (err) {
+ console.log(err)
+ process.exit(1)
+ } else {
+ // second pass to reduce overlapping features
+ pipeline(
+ fs.createReadStream(inputFile),
+ reduceRange,
+ reduceNonRange,
+ ndjson.stringify(),
+ fs.createWriteStream(outputFile),
+ err => {
+ if (err) {
+ console.log(err)
+ process.exit(1)
+ } else {
+ if (argv.debug) {
+ debugKeys.forEach(key => {
+ debugStreams[key].end()
+ })
+
+ Promise.all(debugKeys.map(key => {
+ return new Promise(resolve => {
+ debugStreamOutputs[key].on('finish', () => {
+ console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`)
+ resolve()
+ })
+ })
+ }))
+ .then(() => {
+ process.exit(0)
+ })
+ } else {
+ process.exit(0)
+ }
+ }
+ }
+ )
+ }
+ }
+)
diff --git a/lib/withinRange.js b/lib/withinRange.js
new file mode 100644
index 0000000..e75f788
--- /dev/null
+++ b/lib/withinRange.js
@@ -0,0 +1,45 @@
+/**
+ * @param {Object} feature
+ * @param {Object} rangeFeature
+ *
+ * @returns {boolean} True if addr:housenumber of feature is within the range of addr:housenumber rangeFeature and all other addr:* attributes match
+ */
+module.exports = (feature, rangeFeature) => {
+ const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
+
+ if (
+ // must have a housenumber
+ 'addr:housenumber' in feature.properties &&
+ 'addr:housenumber' in rangeFeature.properties &&
+
+ // must have a street and street must match
+ 'addr:street' in feature.properties &&
+ 'addr:street' in rangeFeature.properties &&
+ feature.properties['addr:street'] === rangeFeature.properties['addr:street'] &&
+
+ // other higher attributes must match if exists
+ feature.properties['addr:suburb'] === rangeFeature.properties['addr:suburb'] &&
+ feature.properties['addr:state'] === rangeFeature.properties['addr:state'] &&
+ feature.properties['addr:postcode'] === rangeFeature.properties['addr:postcode']
+ ) {
+ const rangeParts = rangeFeature.properties['addr:housenumber'].split('-')
+ if (rangeParts.length === 2) {
+ const from = rangeParts[0].match(regexp).groups
+ const to = rangeParts[1].match(regexp).groups
+
+ const i = feature.properties['addr:housenumber'].match(regexp).groups
+ if (i.num >= from.num && i.num <= to.num) {
+ // feature within featureRange (ignore prefix/suffix)
+ return true
+ } else {
+ return false
+ }
+
+ } else {
+ // range is not actually a range
+ return false
+ }
+ } else {
+ return false
+ }
+}
diff --git a/package.json b/package.json
index a3a45f0..825e344 100644
--- a/package.json
+++ b/package.json
@@ -6,7 +6,7 @@
"author": "Andrew Harvey <andrew@alantgeo.com.au>",
"license": "MIT",
"scripts": {
- "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js"
+ "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js"
},
"dependencies": {
"capital-case": "^1.0.4",
diff --git a/test/withinRange.js b/test/withinRange.js
new file mode 100644
index 0000000..1158c20
--- /dev/null
+++ b/test/withinRange.js
@@ -0,0 +1,107 @@
+const test = require('tape')
+
+const withinRange = require('../lib/withinRange.js')
+
+const A = {
+ "type": "Feature",
+ "properties": {
+ "addr:housenumber": "1",
+ "addr:street": "Main Street"
+ },
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ }
+}
+const B = {
+ "type": "Feature",
+ "properties": {
+ "addr:housenumber": "2",
+ "addr:street": "Main Street"
+ },
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ }
+}
+const C = {
+ "type": "Feature",
+ "properties": {
+ "addr:housenumber": "3",
+ "addr:street": "Main Street"
+ },
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ }
+}
+const AB = {
+ "type": "Feature",
+ "properties": {
+ "addr:housenumber": "1-2",
+ "addr:street": "Main Street"
+ },
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ }
+}
+const AC = {
+ "type": "Feature",
+ "properties": {
+ "addr:housenumber": "1-3",
+ "addr:street": "Main Street"
+ },
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ }
+}
+
+const AC_2 = {
+ "type": "Feature",
+ "properties": {
+ "addr:housenumber": "1-3",
+ "addr:street": "Second Street"
+ },
+ "geometry": {
+ "type": "Point",
+ "coordinates": [0, 0]
+ }
+}
+
+
+test('withinRange', t => {
+ t.same(
+ withinRange(A, AB),
+ true,
+ 'A within AB'
+ )
+ t.same(
+ withinRange(A, AC),
+ true,
+ 'A within AC'
+ )
+ t.same(
+ withinRange(B, AB),
+ true,
+ 'B within AB'
+ )
+ t.same(
+ withinRange(B, AC),
+ true,
+ 'B within AC'
+ )
+ t.same(
+ withinRange(C, AB),
+ false,
+ 'C not within AB'
+ )
+ t.same(
+ withinRange(A, AC_2),
+ false,
+ 'A Main Street not within AC Secondary Street'
+ )
+
+ t.end()
+})