#!/usr/bin/env node /** * Remove duplicates created by addresses from a range also appearing individually * eg. * - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists * - 304 Cardigan Street Calton * - 306 Cardigan Street Calton * * - 249-263 Faraday Street * - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range * */ const fs = require('fs') const { Transform, pipeline } = require('stream') const ndjson = require('ndjson') const withinRange = require('../lib/withinRange.js') const argv = require('yargs/yargs')(process.argv.slice(2)) .option('debug', { type: 'boolean', description: 'Dumps full debug logs' }) .option('verbose', { type: 'boolean', description: 'Verbose logging' }) .argv if (argv._.length < 2) { console.error("Usage: ./reduceRangeDuplicates.js input.geojson output.geojson") process.exit(1) } const inputFile = argv._[0] const outputFile = argv._[1] if (!fs.existsSync(inputFile)) { console.error(`${inputFile} not found`) process.exit(1) } const intermediateFile = `${outputFile}-intermediate.json` function hash(feature) { return [ feature.properties['addr:housenumber'], feature.properties['addr:street'], feature.properties['addr:suburb'], feature.properties['addr:state'], feature.properties['addr:postcode'] ].join('/') } let sourceCount = 0 const rangesByStreet = {} const nonRangesByStreet = {} const rangesRemovedInFilterA = {} // index all non-range addresses by street, suburb, state, postcode const index = new Transform({ readableObjectMode: true, writableObjectMode: true, transform(feature, encoding, callback) { sourceCount++ if (process.stdout.isTTY && sourceCount % 10000 === 0) { process.stdout.write(` ${sourceCount.toLocaleString()}\r`) } const isRange = feature.properties['addr:housenumber'].split('-').length > 1 const key = [ feature.properties['addr:street'], feature.properties['addr:suburb'], feature.properties['addr:state'], feature.properties['addr:postcode'] ].join('/') if (isRange) { if (!(key in rangesByStreet)) { rangesByStreet[key] = [] } rangesByStreet[key].push(feature) } else { if (!(key in nonRangesByStreet)) { nonRangesByStreet[key] = [] } nonRangesByStreet[key].push(feature) } callback() } }) const regexp = /^(?
\D*)(?\d*)(? \D*)$/ /* * First pass, filter A removes ranges where each endpoint of the range exists separately * eg. * - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists * - 304 Cardigan Street Calton * - 306 Cardigan Street Calton * * Conditional on the individual addresses not sharing the same geometry, if they do then they are dropped in favour of the range * * - 249-263 Faraday Street * - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range */ let reduceRangeIndex = 0 const reduceRange = new Transform({ readableObjectMode: true, writableObjectMode: true, transform(feature, encoding, callback) { reduceRangeIndex++ if (process.stdout.isTTY && reduceRangeIndex % 10000 === 0) { process.stdout.write(` ${reduceRangeIndex.toLocaleString()} / ${sourceCount.toLocaleString()} (${Math.round(reduceRangeIndex / sourceCount * 100)}%)\r`) } const isRange = feature.properties['addr:housenumber'].split('-').length > 1 if (isRange) { // see if it can be removed when each end point of the range is included separately const start = feature.properties['addr:housenumber'].split('-')[0] const end = feature.properties['addr:housenumber'].split('-')[1] const key = [ feature.properties['addr:street'], feature.properties['addr:suburb'], feature.properties['addr:state'], feature.properties['addr:postcode'] ].join('/') // find nonRange addresses on the same street if (key in nonRangesByStreet) { const matchCandidates = nonRangesByStreet[key] let foundStart = false let foundEnd = false let matchedStart let matchedEnd let startNum let endNum let pre = '' let suf = '' for (const matchCandidate of matchCandidates) { if (!foundStart && start === matchCandidate.properties['addr:housenumber']) { foundStart = true matchedStart = matchCandidate const match = start.match(regexp) if (match && match.groups) { startNum = match.groups.num pre = match.groups.pre suf = match.groups.suf } } if (!foundEnd && end === matchCandidate.properties['addr:housenumber']) { foundEnd = true matchedEnd = matchCandidate const match = end.match(regexp) if (match && match.groups) { endNum = match.groups.num } } if (foundStart && foundEnd) { // stop early break } } if (foundStart && foundEnd && (!startNum || !endNum)) { // found start and end, but couldn't parse out prefix number suffix console.log(`Filter A: Found start + end, but couldn't parse out prefix number suffix: ${start} - ${end}`) } if (foundStart && foundEnd && startNum && endNum) { // found both start and end // see if any intermediates are missing let foundAllIntermediates = true for (let i = (startNum + 2); i <= (endNum - 2) && foundAllIntermediates === true; i += 2) { let foundIntermediate = false matchCandidates.map(matchCandidate => { if (`${pre}${i}${suf}` === matchCandidate.properties['addr:housenumber']) { foundIntermediate = true } }) if (foundIntermediate === false) { foundAllIntermediates = false } } if (!foundAllIntermediates) { // some intermediates were missing // but we'll pretend that's okay and let the geocoding algorithm use it's own interpolation to still find results if (argv.verbose) { console.log('Filter A: Found endpoints but some intermediates are missing', feature) } } // if matched start and end point have the same coordinates, then to avoid overlapping points, favour range so retain it if (matchedStart.geometry.coordinates.join(',') === (matchedEnd.geometry.coordinates.join(','))) { if (argv.verbose) { console.log(`Filter A: ${feature.properties['addr:housenumber']} ${feature.properties['addr:street']} ${feature.properties['addr:suburb']} retained because while endpoints exist they share the same geometry`) } this.push(feature) } else { // can be removed, feature not pushed if (argv.verbose) { console.log(`Filter A: ${feature.properties['addr:housenumber']} ${feature.properties['addr:street']} ${feature.properties['addr:suburb']} can be removed`) } // keep track of removed features for filter B, so we don't double remove both range and midpoints rangesRemovedInFilterA[hash(feature)] = true if (argv.debug) { debugStreams['filterA_dropRange'].write(feature) } } } else { // not both start and end found, // if one of start or end found and that start/end has addr:flats... if (foundStart || foundEnd) { // ...if the range has no flats AND the non-range has addr:flats if (!feature.properties['addr:flats'] && ( (matchedStart && matchedStart.properties['addr:flats']) || (matchedEnd && matchedEnd.properties['addr:flats']) )) { // drop the range, eg "112-116 Anderson Street, South Yarra" if (argv.debug) { debugStreams['filterA_dropRangeRangeNoFlatsNonRangeHasFlats'].write(feature) } } else { // then still include the range this.push(feature) } } else { // then still include the range this.push(feature) } } } else { // there are no non-ranges on this street so still include the range this.push(feature) } } else { // else, not a range, we will see if it can be removed in a second pass // shall be removed removed when this non-range exists within a range, but the range wasn't removed already this.push(feature) } callback() } }) /* * Second pass, filter B removes any non-range elements where the range exists, and wasn't removed from the first pass * eg. * - 249-263 Faraday Street * - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range */ let reduceNonRangeIndex = 0 const reduceNonRange = new Transform({ readableObjectMode: true, writableObjectMode: true, transform(feature, encoding, callback) { reduceNonRangeIndex++ if (process.stdout.isTTY && reduceNonRangeIndex % 10000 === 0) { process.stdout.write(` ${reduceNonRangeIndex.toLocaleString()} / ${sourceCount.toLocaleString()} (${Math.round(reduceNonRangeIndex / sourceCount * 100)}%)\r`) } const isRange = feature.properties['addr:housenumber'].split('-').length > 1 if (!isRange) { // not a range, shall be removed where this non-range exists within a range, but the range wasn't removed already const key = [ feature.properties['addr:street'], feature.properties['addr:suburb'], feature.properties['addr:state'], feature.properties['addr:postcode'] ].join('/') let dropFeature = false let dropReason if (key in rangesByStreet) { for (let i = 0; i < rangesByStreet[key].length; i++) { const range = rangesByStreet[key][i] // if the range wasn't just removed in filter A, and the feature is within the range if (!(hash(range) in rangesRemovedInFilterA) && withinRange(feature, range, { matchParity: true })) { // found within a range, drop feature unless would drop addr:unit or addr:flats information if ('addr:unit' in feature.properties || 'addr:flats' in feature.properties) { // safe to drop if the same addr:unit and addr:flats is also on the range if ( 'addr:unit' in feature.properties ? ('addr:unit' in range.properties && feature.properties['addr:unit'] === range.properties['addr:unit']) : true && 'addr:flats' in feature.properties ? ('addr:flats' in range.properties && feature.properties['addr:flats'] === range.properties['addr:flats']) : true ) { dropReason = `Dropped due to existing range ${range.properties['addr:housenumber']} ${range.properties['addr:street']} ${range.properties._pfi ? '(' + range.properties._pfi + ')' : ''} where flats and unit match` dropFeature = true } else { // since the non-range feature has a unit that the range doesn't have, don't drop it dropFeature = false if (argv.debug) { debugStreams.addrInRangeDifferentUnits.write(feature) debugStreams.addrInRangeDifferentUnits.write(range) debugStreams.addrInRangeDifferentUnits.write({ type: 'Feature', properties: feature.properties, geometry: { type: 'LineString', coordinates: [feature.geometry.coordinates, range.geometry.coordinates] } }) } } } else { // no addr:unit or addr:flats on the feature to safe to drop dropReason = `Dropped due to existing range ${range.properties['addr:housenumber']} ${range.properties['addr:street']} ${range.properties._pfi ? '(' + range.properties._pfi + ')' : ''} without flats or unit to check` dropFeature = true } break } } } if (!dropFeature) { this.push(feature) } else { if (argv.verbose) { console.log(`Filter B: Dropping ${feature.properties['addr:housenumber']}`) } if (argv.debug) { feature.properties._dropReason = dropReason debugStreams['filterB'].write(feature) } } } else { this.push(feature) } callback() } }) // ndjson streams to output debug features const debugKeys = ['addrInRangeDifferentUnits', 'filterA_dropRangeRangeNoFlatsNonRangeHasFlats', 'filterA_dropRange', 'filterB'] const debugStreams = {} const debugStreamOutputs = {} if (argv.debug) { debugKeys.forEach(key => { debugStreams[key] = ndjson.stringify() debugStreamOutputs[key] = debugStreams[key].pipe(fs.createWriteStream(`debug/reduceRangeDuplicates/${key}.geojson`)) }) } // first pass to index by geometry console.log('Pass 1/2: index non-ranges by street,suburb,state,postcode properties') pipeline( fs.createReadStream(inputFile), ndjson.parse(), index, err => { if (err) { console.log(err) process.exit(1) } else { // second pass to remove range duplicates part A console.log('Pass 2/3: remove range duplicates part A ranges') pipeline( fs.createReadStream(inputFile), ndjson.parse(), reduceRange, ndjson.stringify(), fs.createWriteStream(intermediateFile), err => { if (err) { console.log(err) process.exit(1) } else { console.log('Pass 3/3: remove range duplicates part B endpoints') pipeline( fs.createReadStream(intermediateFile), ndjson.parse(), reduceNonRange, ndjson.stringify(), fs.createWriteStream(outputFile), err => { fs.unlinkSync(intermediateFile) if (err) { console.log(err) process.exit(1) } else { if (argv.debug) { debugKeys.forEach(key => { debugStreams[key].end() }) Promise.all(debugKeys.map(key => { return new Promise(resolve => { debugStreamOutputs[key].on('finish', () => { console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`) resolve() }) }) })) .then(() => { process.exit(0) }) } else { process.exit(0) } } } ) } } ) } } )