diff options
author | Andrew Harvey <andrew@alantgeo.com.au> | 2021-05-15 18:38:43 +1000 |
---|---|---|
committer | Andrew Harvey <andrew@alantgeo.com.au> | 2021-05-15 18:38:43 +1000 |
commit | 566b7ace89a00d64aa9b94d4659d01f79bd31fe9 (patch) | |
tree | 838ec1b3bd90d43e14a8e05b463f1359f3979590 /bin/reduceRangeDuplicates.js | |
parent | 116b43955003421b7c6b6f781b0e0cc329c64063 (diff) |
fix reduceRangeDuplicates and add tests
Diffstat (limited to 'bin/reduceRangeDuplicates.js')
-rwxr-xr-x | bin/reduceRangeDuplicates.js | 59 |
1 files changed, 47 insertions, 12 deletions
diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js index 32155a4..b719124 100755 --- a/bin/reduceRangeDuplicates.js +++ b/bin/reduceRangeDuplicates.js @@ -22,6 +22,10 @@ const argv = require('yargs/yargs')(process.argv.slice(2)) type: 'boolean', description: 'Dumps full debug logs' }) + .option('verbose', { + type: 'boolean', + description: 'Verbose logging' + }) .argv if (argv._.length < 2) { @@ -37,10 +41,21 @@ if (!fs.existsSync(inputFile)) { process.exit(1) } +function hash(feature) { + return [ + feature.properties['addr:housenumber'], + feature.properties['addr:street'], + feature.properties['addr:suburb'], + feature.properties['addr:state'], + feature.properties['addr:postcode'] + ].join('/') +} + let sourceCount = 0 const ranges = [] const nonRangesByStreet = {} +const rangesRemovedInFilterA = {} // index all non-range addresses by street, suburb, state, postcode const index = new Transform({ @@ -78,7 +93,7 @@ const index = new Transform({ const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/ /* -* First pass removes ranges where each endpoint of the range exists seperatly +* second pass, filter A removes ranges where each endpoint of the range exists separately * eg. * - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists * - 304 Cardigan Street Calton @@ -98,9 +113,8 @@ const reduceRange = new Transform({ } const isRange = feature.properties['addr:housenumber'].split('-').length > 1 - if (isRange) { - // see if it can be removed when each end point of the range is included seperatly + // see if it can be removed when each end point of the range is included separately const start = feature.properties['addr:housenumber'].split('-')[0] const end = feature.properties['addr:housenumber'].split('-')[1] @@ -123,8 +137,8 @@ const reduceRange = new Transform({ let pre = '' let suf = '' - matchCandidates.map(matchCandidate => { - if (start === matchCandidate.properties['addr:housenumber']) { + for (const matchCandidate of matchCandidates) { + if (!foundStart && start === matchCandidate.properties['addr:housenumber']) { foundStart = true const match = start.match(regexp) @@ -132,13 +146,18 @@ const reduceRange = new Transform({ pre = match.groups.pre suf = match.groups.suf } - if (end === matchCandidate.properties['addr:housenumber']) { + if (!foundEnd && end === matchCandidate.properties['addr:housenumber']) { foundEnd = true const match = end.match(regexp) endNum = match.groups.num } - }) + + if (foundStart && foundEnd) { + // stop early + break + } + } if (foundStart && foundEnd) { // found both start and end @@ -160,10 +179,18 @@ const reduceRange = new Transform({ if (!foundAllIntermediates) { // some intermediates were missing // but we'll pretend that's okay and let the geocoding algorithm use it's own interpolation to still find results - console.log('found endpoints but some intermediates are missing', feature) + if (argv.verbose) { + console.log('Filter A: Found endpoints but some intermediates are missing', feature) + } } // can be removed, feature not pushed + if (argv.verbose) { + console.log(`Filter A: ${feature.properties['addr:housenumber']} can be removed`) + } + + // keep track of removed features for filter B, so we don't double remove both range and midpoints + rangesRemovedInFilterA[hash(feature)] = true } else { // since not both start and end found, then still include the range this.push(feature) @@ -183,7 +210,7 @@ const reduceRange = new Transform({ }) /* -* Second pass removes ane non-range elements where the range exists, and wasn't removed from the first pass +* Second pass, filter B removes any non-range elements where the range exists, and wasn't removed from the first pass * eg. * - 249-263 Faraday Street * - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range @@ -201,11 +228,12 @@ const reduceNonRange = new Transform({ const isRange = feature.properties['addr:housenumber'].split('-').length > 1 if (!isRange) { - // not a range, ahall be removed removed when this non-range exists within a range, but the range wasn't removed already + // not a range, shall be removed where this non-range exists within a range, but the range wasn't removed already let dropFeature = false for (let i = 0; i < ranges.length; i++) { const range = ranges[i] - if (withinRange(feature, range)) { + // if the range wasn't just removed in filter A, and the feature is within the range + if (!(hash(range) in rangesRemovedInFilterA) && withinRange(feature, range)) { // found within a range, drop feature unless would drop addr:unit information if ('addr:unit' in feature.properties) { // safe to drop if the same addr:unit is also on the range @@ -227,7 +255,13 @@ const reduceNonRange = new Transform({ } if (!dropFeature) { this.push(feature) + } else { + if (argv.verbose) { + console.log(`Filter B: Dropping ${feature.properties['addr:housenumber']}`) + } } + } else { + this.push(feature) } callback() @@ -257,7 +291,8 @@ pipeline( console.log(err) process.exit(1) } else { - // second pass to reduce overlapping features + console.log('Second pass to remove range duplicates') + // second pass to remove range duplicates pipeline( fs.createReadStream(inputFile), ndjson.parse(), |