diff options
-rwxr-xr-x | bin/reduceRangeDuplicates.js | 59 | ||||
-rw-r--r-- | package.json | 3 | ||||
-rw-r--r-- | test/reduceRangeDuplicates.js | 81 | ||||
-rw-r--r-- | yarn.lock | 5 |
4 files changed, 135 insertions, 13 deletions
diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js index 32155a4..b719124 100755 --- a/bin/reduceRangeDuplicates.js +++ b/bin/reduceRangeDuplicates.js @@ -22,6 +22,10 @@ const argv = require('yargs/yargs')(process.argv.slice(2)) type: 'boolean', description: 'Dumps full debug logs' }) + .option('verbose', { + type: 'boolean', + description: 'Verbose logging' + }) .argv if (argv._.length < 2) { @@ -37,10 +41,21 @@ if (!fs.existsSync(inputFile)) { process.exit(1) } +function hash(feature) { + return [ + feature.properties['addr:housenumber'], + feature.properties['addr:street'], + feature.properties['addr:suburb'], + feature.properties['addr:state'], + feature.properties['addr:postcode'] + ].join('/') +} + let sourceCount = 0 const ranges = [] const nonRangesByStreet = {} +const rangesRemovedInFilterA = {} // index all non-range addresses by street, suburb, state, postcode const index = new Transform({ @@ -78,7 +93,7 @@ const index = new Transform({ const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/ /* -* First pass removes ranges where each endpoint of the range exists seperatly +* second pass, filter A removes ranges where each endpoint of the range exists separately * eg. * - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists * - 304 Cardigan Street Calton @@ -98,9 +113,8 @@ const reduceRange = new Transform({ } const isRange = feature.properties['addr:housenumber'].split('-').length > 1 - if (isRange) { - // see if it can be removed when each end point of the range is included seperatly + // see if it can be removed when each end point of the range is included separately const start = feature.properties['addr:housenumber'].split('-')[0] const end = feature.properties['addr:housenumber'].split('-')[1] @@ -123,8 +137,8 @@ const reduceRange = new Transform({ let pre = '' let suf = '' - matchCandidates.map(matchCandidate => { - if (start === matchCandidate.properties['addr:housenumber']) { + for (const matchCandidate of matchCandidates) { + if (!foundStart && start === matchCandidate.properties['addr:housenumber']) { foundStart = true const match = start.match(regexp) @@ -132,13 +146,18 @@ const reduceRange = new Transform({ pre = match.groups.pre suf = match.groups.suf } - if (end === matchCandidate.properties['addr:housenumber']) { + if (!foundEnd && end === matchCandidate.properties['addr:housenumber']) { foundEnd = true const match = end.match(regexp) endNum = match.groups.num } - }) + + if (foundStart && foundEnd) { + // stop early + break + } + } if (foundStart && foundEnd) { // found both start and end @@ -160,10 +179,18 @@ const reduceRange = new Transform({ if (!foundAllIntermediates) { // some intermediates were missing // but we'll pretend that's okay and let the geocoding algorithm use it's own interpolation to still find results - console.log('found endpoints but some intermediates are missing', feature) + if (argv.verbose) { + console.log('Filter A: Found endpoints but some intermediates are missing', feature) + } } // can be removed, feature not pushed + if (argv.verbose) { + console.log(`Filter A: ${feature.properties['addr:housenumber']} can be removed`) + } + + // keep track of removed features for filter B, so we don't double remove both range and midpoints + rangesRemovedInFilterA[hash(feature)] = true } else { // since not both start and end found, then still include the range this.push(feature) @@ -183,7 +210,7 @@ const reduceRange = new Transform({ }) /* -* Second pass removes ane non-range elements where the range exists, and wasn't removed from the first pass +* Second pass, filter B removes any non-range elements where the range exists, and wasn't removed from the first pass * eg. * - 249-263 Faraday Street * - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range @@ -201,11 +228,12 @@ const reduceNonRange = new Transform({ const isRange = feature.properties['addr:housenumber'].split('-').length > 1 if (!isRange) { - // not a range, ahall be removed removed when this non-range exists within a range, but the range wasn't removed already + // not a range, shall be removed where this non-range exists within a range, but the range wasn't removed already let dropFeature = false for (let i = 0; i < ranges.length; i++) { const range = ranges[i] - if (withinRange(feature, range)) { + // if the range wasn't just removed in filter A, and the feature is within the range + if (!(hash(range) in rangesRemovedInFilterA) && withinRange(feature, range)) { // found within a range, drop feature unless would drop addr:unit information if ('addr:unit' in feature.properties) { // safe to drop if the same addr:unit is also on the range @@ -227,7 +255,13 @@ const reduceNonRange = new Transform({ } if (!dropFeature) { this.push(feature) + } else { + if (argv.verbose) { + console.log(`Filter B: Dropping ${feature.properties['addr:housenumber']}`) + } } + } else { + this.push(feature) } callback() @@ -257,7 +291,8 @@ pipeline( console.log(err) process.exit(1) } else { - // second pass to reduce overlapping features + console.log('Second pass to remove range duplicates') + // second pass to remove range duplicates pipeline( fs.createReadStream(inputFile), ndjson.parse(), diff --git a/package.json b/package.json index 388427e..f13575e 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,7 @@ "author": "Andrew Harvey <andrew@alantgeo.com.au>", "license": "MIT", "scripts": { - "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js test/valueLimits.js" + "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js test/valueLimits.js test/reduceRangeDuplicates.js" }, "dependencies": { "capital-case": "^1.0.4", @@ -14,6 +14,7 @@ "clone-deep": "^4.0.1", "flatbush": "^3.3.0", "geoflatbush": "^1.0.0", + "mktemp": "^1.0.0", "ndjson": "^2.0.0", "readable-stream": "^3.6.0", "tape": "^5.2.2", diff --git a/test/reduceRangeDuplicates.js b/test/reduceRangeDuplicates.js new file mode 100644 index 0000000..e180841 --- /dev/null +++ b/test/reduceRangeDuplicates.js @@ -0,0 +1,81 @@ +const test = require('tape') +const fs = require('fs') +const child_process = require('child_process') +const mktemp = require('mktemp') + +function createFeature(housenumber, street, suburb) { + return { + type: 'Feature', + properties: { + 'addr:housenumber': housenumber, + 'addr:street': street, + 'addr:suburb': suburb, + 'addr:state': 'VIC', + 'addr:postcode': '0000' + }, + geometry: null + } +} + +test('reduceRangeDuplicates', t => { + const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson') + const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson') + const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson') + + const AB = createFeature('304-306', 'Cardigan Street', 'Carlton') + const A = createFeature('304', 'Cardigan Street', 'Carlton') + const B = createFeature('306', 'Cardigan Street', 'Carlton') + + // all three features to appear in input + fs.appendFileSync(inputFile, JSON.stringify(AB) + '\n') + fs.appendFileSync(inputFile, JSON.stringify(A) + '\n') + fs.appendFileSync(inputFile, JSON.stringify(B) + '\n') + + // output expected to just be endpoints, dropping the range + fs.appendFileSync(expectedFile, JSON.stringify(A) + '\n') + fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n') + + child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`) + + t.same( + fs.readFileSync(outputFile), + fs.readFileSync(expectedFile), + 'range with endpoints appearing separately, drops range' + ) + + fs.unlinkSync(inputFile) + fs.unlinkSync(outputFile) + fs.unlinkSync(expectedFile) + + t.end() +}) + +test('reduceRangeDuplicates', t => { + const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson') + const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson') + const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson') + + const AC = createFeature('249-263', 'Faraday Street', 'Carlton') + const B = createFeature('251', 'Faraday Street', 'Carlton') + + // both features to appear in input + fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n') + fs.appendFileSync(inputFile, JSON.stringify(B) + '\n') + + // output expected to just be range, dropping the midpoint + fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n') + + child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`) + + t.same( + fs.readFileSync(outputFile), + fs.readFileSync(expectedFile), + 'range with lone midpoint, drops midpoint' + ) + + fs.unlinkSync(inputFile) + fs.unlinkSync(outputFile) + fs.unlinkSync(expectedFile) + + t.end() +}) @@ -436,6 +436,11 @@ minimist@^1.2.5: resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.5.tgz#67d66014b66a6a8aaa0c083c5fd58df4e4e97602" integrity sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw== +mktemp@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/mktemp/-/mktemp-1.0.0.tgz#b670eff23f52d6529e1dc362cb74ddf85448a9e3" + integrity sha512-2duBeS0A75x0M3sCoY0R1TiLsYfIBUtNBNWS++eo+bX/ObVqzblqnEQhlaepoBOLD14wklsV3cYxZ68o5qYO8A== + ndjson@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/ndjson/-/ndjson-2.0.0.tgz#320ac86f6fe53f5681897349b86ac6f43bfa3a19" |