diff options
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | README.md | 14 | ||||
-rwxr-xr-x | bin/reduceRangeDuplicates.js | 295 | ||||
-rw-r--r-- | lib/withinRange.js | 45 | ||||
-rw-r--r-- | package.json | 2 | ||||
-rw-r--r-- | test/withinRange.js | 107 |
6 files changed, 465 insertions, 2 deletions
@@ -48,6 +48,10 @@ dist/vicmap-osm-uniq-flats.geojson: dist/vicmap-osm-uniq.geojson mkdir -p debug/reduceOverlap node --max_old_space_size=4096 ./bin/reduceOverlap.js --debug $< $@ +dist/vicmap-osm-uniq-flats-withinrange.geojson: dist/vicmap-osm-uniq-flats.geojson + mkdir -p debug/reduceRangeDuplicates + node --max_old_space_size=4096 ./bin/reduceRangeDuplicates.js --debug $< $@ + loadPgOSM: dist/vicmap-osm.geojson ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vm_osm @@ -28,10 +28,14 @@ Remove duplicates where all address attributes match at the same location or wit Reduce some address points with the same coordinates but different address attributes (see _Overlapping points_ below) (code at `bin/reduceOverlap.js`): - make dist/vicmap-osm-flats.geojson + make dist/vicmap-osm-uniq-flats.geojson This is only done for strictly overlapping points, where the geometry varies slightly then that's okay we don't attempt to combine. +Drop address ranges where the range endpoints are separately mapped. + + make dist/vicmap-osm-uniq-flats-withinrange.geojson + ### Omitted addresses Source addresses are omitted where they: @@ -42,6 +46,14 @@ Since these addresses have no identifying attribute beyond street, and there is These rules are defined in `filterOSM.js`. +#### Duplicates through mixed range/individual points + +Some addresses appear as both a range and individual points. For example one address as `1-5` but additional addresses as `1`, `3` and `5`. + +Where the endpoints of the range match existing non-range address points, and where the unit value is the same, and where the individual points have different geometries the range address is dropped in favour of the individual points. + +Where the individual points share the same geometry as each other, then the range is favoured and the individual points are dropped. 
+
 ### OSM schema
 
 - `addr:unit` is constructed either as a single value or range where the building unit is supplied
diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js
new file mode 100755
index 0000000..ed85d75
--- /dev/null
+++ b/bin/reduceRangeDuplicates.js
@@ -0,0 +1,295 @@
+#!/usr/bin/env node
+
+/**
+ * Remove duplicates created by addresses from a range also appearing individually
+ * eg.
+ * - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
+ * - 304 Cardigan Street Carlton
+ * - 306 Cardigan Street Carlton
+ *
+ * - 249-263 Faraday Street
+ * - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
+ *
+ */
+
+const fs = require('fs')
+const { Transform, pipeline } = require('stream')
+const ndjson = require('ndjson')
+// require() resolves relative to this file in bin/, so lib/ is one directory up
+const withinRange = require('../lib/withinRange.js')
+
+const argv = require('yargs/yargs')(process.argv.slice(2))
+  .option('debug', {
+    type: 'boolean',
+    description: 'Dumps full debug logs'
+  })
+  .argv
+
+if (argv._.length < 2) {
+  console.error("Usage: ./reduceRangeDuplicates.js input.geojson output.geojson")
+  process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+  console.error(`${inputFile} not found`)
+  process.exit(1)
+}
+
+let sourceCount = 0
+
+// every range feature seen by the index pass
+const ranges = []
+// addr:housenumber values of the non-range features, grouped by streetKey()
+const nonRangeHousenumbersByStreet = new Map()
+
+const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
+
+/**
+ * @param {Object} feature
+ *
+ * @returns {string} key grouping features by street, suburb, state, postcode
+ */
+const streetKey = feature => [
+  feature.properties['addr:street'],
+  feature.properties['addr:suburb'],
+  feature.properties['addr:state'],
+  feature.properties['addr:postcode']
+].join('/')
+
+/**
+ * @param {Object} feature
+ *
+ * @returns {boolean} True if addr:housenumber is a range like 304-306, a feature without a housenumber is not a range
+ */
+const isRangeFeature = feature => {
+  const housenumber = feature.properties['addr:housenumber']
+  return typeof housenumber === 'string' && housenumber.split('-').length > 1
+}
+
+/**
+ * Reads the index built by the first pass, so only call this once that pass has completed.
+ *
+ * NOTE(review): the README also says the unit value must be the same, addr:unit is not compared here.
+ *
+ * @param {Object} range - feature where addr:housenumber is a range
+ *
+ * @returns {boolean} True if both endpoints of the range exist as non-range addresses on the same street
+ */
+const endpointsMappedSeparately = range => {
+  const housenumbers = nonRangeHousenumbersByStreet.get(streetKey(range))
+  if (housenumbers === undefined) {
+    // there are no non-ranges on this street
+    return false
+  }
+
+  const [start, end] = range.properties['addr:housenumber'].split('-')
+  return housenumbers.has(start) && housenumbers.has(end)
+}
+
+/**
+ * @param {Object} range - feature where addr:housenumber is a range
+ *
+ * @returns {boolean} True if stepping by 2 between the endpoints finds a housenumber which isn't a non-range address
+ */
+const hasMissingIntermediates = range => {
+  const housenumbers = nonRangeHousenumbersByStreet.get(streetKey(range))
+  const [start, end] = range.properties['addr:housenumber'].split('-')
+  const startMatch = start.match(regexp)
+  const endMatch = end.match(regexp)
+  if (housenumbers === undefined || startMatch === null || endMatch === null) {
+    return false
+  }
+
+  const { pre, suf } = startMatch.groups
+  // the captured groups are strings, parse them so + 2 adds rather than concatenates
+  const startNum = Number.parseInt(startMatch.groups.num, 10)
+  const endNum = Number.parseInt(endMatch.groups.num, 10)
+
+  // when either number is NaN the loop condition is false and nothing is reported missing
+  for (let i = startNum + 2; i <= endNum - 2; i += 2) {
+    if (!housenumbers.has(`${pre}${i}${suf}`)) {
+      return true
+    }
+  }
+  return false
+}
+
+// index all non-range addresses by street, suburb, state, postcode
+const index = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    sourceCount++
+
+    if (sourceCount % 10000 === 0) {
+      process.stdout.write(` ${sourceCount / 1000}k\r`)
+    }
+
+    if (isRangeFeature(feature)) {
+      ranges.push(feature)
+    } else {
+      const key = streetKey(feature)
+
+      if (!nonRangeHousenumbersByStreet.has(key)) {
+        nonRangeHousenumbersByStreet.set(key, new Set())
+      }
+      nonRangeHousenumbersByStreet.get(key).add(feature.properties['addr:housenumber'])
+    }
+
+    callback()
+  }
+})
+
+/*
+* First pass removes ranges where each endpoint of the range exists separately
+* eg.
+* - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
+* - 304 Cardigan Street Carlton
+* - 306 Cardigan Street Carlton
+*
+* - 249-263 Faraday Street
+* - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
+*/
+let reduceRangeIndex = 0
+const reduceRange = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    reduceRangeIndex++
+    if (reduceRangeIndex % 10000 === 0) {
+      process.stdout.write(` ${reduceRangeIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceRangeIndex / sourceCount * 100)}%)\r`)
+    }
+
+    if (isRangeFeature(feature) && endpointsMappedSeparately(feature)) {
+      // found both start and end
+      if (hasMissingIntermediates(feature)) {
+        // some intermediates were missing
+        // but we'll pretend that's okay and let the geocoding algorithm use its own interpolation to still find results
+        console.log('found endpoints but some intermediates are missing', feature)
+      }
+
+      // can be removed, feature not pushed
+    } else {
+      // either a range without both endpoints mapped separately, which is kept
+      // or not a range, we will see if it can be removed in the second pass
+      this.push(feature)
+    }
+
+    callback()
+  }
+})
+
+// ranges kept by the first pass grouped by streetKey(), built on first use
+let retainedRangesByStreet = null
+
+/**
+ * The second pass can see a non-range before reduceRange has seen its range,
+ * so the ranges which get removed are worked out from the index rather than
+ * recorded as reduceRange runs.
+ *
+ * @returns {Map} ranges which reduceRange doesn't remove, by streetKey()
+ */
+const getRetainedRangesByStreet = () => {
+  if (retainedRangesByStreet === null) {
+    retainedRangesByStreet = new Map()
+    for (const range of ranges) {
+      if (!endpointsMappedSeparately(range)) {
+        const key = streetKey(range)
+        if (!retainedRangesByStreet.has(key)) {
+          retainedRangesByStreet.set(key, [])
+        }
+        retainedRangesByStreet.get(key).push(range)
+      }
+    }
+  }
+  return retainedRangesByStreet
+}
+
+/*
+* Second pass removes any non-range elements where the range exists, and wasn't removed from the first pass
+* eg.
+* - 249-263 Faraday Street
+* - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
+*/
+let reduceNonRangeIndex = 0
+const reduceNonRange = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    reduceNonRangeIndex++
+    if (reduceNonRangeIndex % 10000 === 0) {
+      process.stdout.write(` ${reduceNonRangeIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceNonRangeIndex / sourceCount * 100)}%)\r`)
+    }
+
+    if (isRangeFeature(feature)) {
+      // a range which made it through the first pass is always kept
+      this.push(feature)
+    } else {
+      // not a range, shall be removed when this non-range exists within a range, but the range wasn't removed already
+      let dropFeature = false
+
+      // withinRange needs street, suburb, state and postcode to be equal, so only ranges with the same key can match
+      const candidateRanges = getRetainedRangesByStreet().get(streetKey(feature)) || []
+
+      // for...of rather than forEach since break isn't allowed inside a forEach callback
+      for (const range of candidateRanges) {
+        if (withinRange(feature, range)) {
+          // found within a range, drop feature unless would drop addr:unit information
+          if ('addr:unit' in feature.properties) {
+            // safe to drop if the same addr:unit is also on the range
+            if ('addr:unit' in range.properties &&
+              feature.properties['addr:unit'] === range.properties['addr:unit']) {
+              dropFeature = true
+            } else {
+              // since the non-range feature has a unit that the range doesn't have, don't drop it
+              dropFeature = false
+              // the debug streams only exist when run with --debug
+              if (argv.debug) {
+                debugStreams['addrInRangeDifferentUnits'].write(feature)
+                debugStreams['addrInRangeDifferentUnits'].write(range)
+              }
+            }
+          } else {
+            // no addr:unit on the feature so safe to drop
+            dropFeature = true
+          }
+          // the first range the feature is within decides
+          break
+        }
+      }
+
+      if (!dropFeature) {
+        this.push(feature)
+      }
+    }
+
+    callback()
+  }
+})
+
+// ndjson streams to output debug features
+const debugKeys = ['addrInRangeDifferentUnits']
+const debugStreams = {}
+const debugStreamOutputs = {}
+
+if (argv.debug) {
+  debugKeys.forEach(key => {
+    debugStreams[key] = ndjson.stringify()
+    debugStreamOutputs[key] = debugStreams[key].pipe(fs.createWriteStream(`debug/reduceRangeDuplicates/${key}.geojson`))
+  })
+}
+
+// first pass to index non-ranges by street, suburb, state, postcode
+console.log('First pass to index non-ranges by street,suburb,state,postcode properties')
+pipeline(
+  fs.createReadStream(inputFile),
+  ndjson.parse(),
+  index,
+  err => {
+    if (err) {
+      console.log(err)
+      process.exit(1)
+    } else {
+      // second pass to reduce overlapping features
+      pipeline(
+        fs.createReadStream(inputFile),
+        // parse each line into a feature, the reduce transforms read feature.properties so can't take raw file chunks
+        ndjson.parse(),
+        reduceRange,
+        reduceNonRange,
+        ndjson.stringify(),
+        fs.createWriteStream(outputFile),
+        err => {
+          if (err) {
+            console.log(err)
+            process.exit(1)
+          } else {
+            if (argv.debug) {
+              debugKeys.forEach(key => {
+                debugStreams[key].end()
+              })
+
+              // only exit once every debug file has been completely written
+              Promise.all(debugKeys.map(key => {
+                return new Promise(resolve => {
+                  debugStreamOutputs[key].on('finish', () => {
+                    console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`)
+                    resolve()
+                  })
+                })
+              }))
+                .then(() => {
+                  process.exit(0)
+                })
+            } else {
+              process.exit(0)
+            }
+          }
+        }
+      )
+    }
+  }
+)
diff --git a/lib/withinRange.js b/lib/withinRange.js
new file mode 100644
index 0000000..e75f788
--- /dev/null
+++ b/lib/withinRange.js
@@ -0,0 +1,45 @@
+const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
+
+/**
+ * @param {string} housenumber - a single housenumber, not a range
+ *
+ * @returns {number} the numeric part of the housenumber ignoring any prefix/suffix, NaN when there isn't one
+ */
+const numericPart = housenumber => {
+  const match = String(housenumber).match(regexp)
+  return match === null ? NaN : Number.parseInt(match.groups.num, 10)
+}
+
+/**
+ * @param {Object} feature
+ * @param {Object} rangeFeature
+ *
+ * @returns {boolean} True if addr:housenumber of feature is within the range of addr:housenumber rangeFeature and addr:street, addr:suburb, addr:state and addr:postcode match
+ */
+module.exports = (feature, rangeFeature) => {
+  const sameStreet =
+    // must have a housenumber
+    'addr:housenumber' in feature.properties &&
+    'addr:housenumber' in rangeFeature.properties &&
+
+    // must have a street and street must match
+    'addr:street' in feature.properties &&
+    'addr:street' in rangeFeature.properties &&
+    feature.properties['addr:street'] === rangeFeature.properties['addr:street'] &&
+
+    // other higher attributes must match if exists
+    feature.properties['addr:suburb'] === rangeFeature.properties['addr:suburb'] &&
+    feature.properties['addr:state'] === rangeFeature.properties['addr:state'] &&
+    feature.properties['addr:postcode'] === rangeFeature.properties['addr:postcode']
+
+  if (!sameStreet) {
+    return false
+  }
+
+  const rangeParts = String(rangeFeature.properties['addr:housenumber']).split('-')
+  if (rangeParts.length !== 2) {
+    // range is not actually a range
+    return false
+  }
+
+  // compare as numbers, as strings "10" sorts before "2" so 10 wouldn't be within 2-20
+  const from = numericPart(rangeParts[0])
+  const to = numericPart(rangeParts[1])
+  const i = numericPart(feature.properties['addr:housenumber'])
+
+  // feature within featureRange (ignore prefix/suffix)
+  // any comparison with NaN is false, so a housenumber without a number is never within a range
+  return i >= from && i <= to
+}
diff --git a/package.json b/package.json
index a3a45f0..825e344 100644
--- a/package.json
+++ b/package.json
@@ -6,7 +6,7 @@
   "author": "Andrew Harvey <andrew@alantgeo.com.au>",
   "license": "MIT",
   "scripts": {
-    "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js"
+    "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js"
   },
   "dependencies": {
     "capital-case": "^1.0.4",
diff --git a/test/withinRange.js b/test/withinRange.js
new file mode 100644
index 0000000..1158c20
--- /dev/null
+++ b/test/withinRange.js
@@ -0,0 +1,107 @@
+const test = require('tape')
+
+const withinRange = require('../lib/withinRange.js')
+
+const A = {
+  "type": "Feature",
+  "properties": {
+    "addr:housenumber": "1",
+    "addr:street": "Main Street"
+  },
+  "geometry": {
+    "type": "Point",
+    "coordinates": [0, 0]
+  }
+}
+const B = {
+  "type": "Feature",
+  "properties": {
+    "addr:housenumber": "2",
+    "addr:street": "Main Street"
+  },
+  "geometry": {
+    "type": "Point",
+    "coordinates": [0, 0]
+  }
+}
+const C = {
+  "type": "Feature",
+  "properties": {
+    "addr:housenumber": "3",
+    "addr:street": "Main Street"
+  },
+  "geometry": {
+    "type": "Point",
+    "coordinates": [0, 0]
+  }
+}
+// two digit housenumber, only within 2-20 when compared as a number
+const J = {
+  "type": "Feature",
+  "properties": {
+    "addr:housenumber": "10",
+    "addr:street": "Main Street"
+  },
+  "geometry": {
+    "type": "Point",
+    "coordinates": [0, 0]
+  }
+}
+const AB = {
+  "type": "Feature",
+  "properties": {
+    "addr:housenumber": "1-2",
+    "addr:street": "Main Street"
+  },
+  "geometry": {
+    "type": "Point",
+    "coordinates": [0, 0]
+  }
+}
+const AC = {
+  "type": "Feature",
+  "properties": {
+    "addr:housenumber": "1-3",
+    "addr:street": "Main Street"
+  },
+  "geometry": {
+    "type": "Point",
+    "coordinates": [0, 0]
+  }
+}
+
+const AC_2 = {
+  "type": "Feature",
+  "properties": {
+    "addr:housenumber": "1-3",
+    "addr:street": "Second Street"
+  },
+  "geometry": {
+    "type": "Point",
+    "coordinates": [0, 0]
+  }
+}
+
+// range where the endpoints have a different number of digits
+const BT = {
+  "type": "Feature",
+  "properties": {
+    "addr:housenumber": "2-20",
+    "addr:street": "Main Street"
+  },
+  "geometry": {
+    "type": "Point",
+    "coordinates": [0, 0]
+  }
+}
+
+
+test('withinRange', t => {
+  t.same(
+    withinRange(A, AB),
+    true,
+    'A within AB'
+  )
+  t.same(
+    withinRange(A, AC),
+    true,
+    'A within AC'
+  )
+  t.same(
+    withinRange(B, AB),
+    true,
+    'B within AB'
+  )
+  t.same(
+    withinRange(B, AC),
+    true,
+    'B within AC'
+  )
+  t.same(
+    withinRange(C, AB),
+    false,
+    'C not within AB'
+  )
+  t.same(
+    withinRange(A, AC_2),
+    false,
+    'A Main Street not within AC Secondary Street'
+  )
+  t.same(
+    withinRange(J, BT),
+    true,
+    '10 within 2-20, housenumbers compare as numbers'
+  )
+  t.same(
+    withinRange(C, BT),
+    true,
+    '3 within 2-20, housenumbers compare as numbers'
+  )
+  t.same(
+    withinRange(A, BT),
+    false,
+    '1 not within 2-20'
+  )
+
+  t.end()
+})
|