-rw-r--r--  .vscode/launch.json                                          |  12
-rwxr-xr-x  bin/reduceRangeDuplicates.js                                 | 113
-rw-r--r--  test/fixtures/reduceRangeDuplicates/expectedOutput1.geojson  |   2
-rw-r--r--  test/fixtures/reduceRangeDuplicates/input1.geojson           |   3
-rw-r--r--  test/reduceRangeDuplicates.js                                |  85
5 files changed, 156 insertions, 59 deletions
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 707d6d2..291dcce 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -61,8 +61,18 @@
       "skipFiles": [
         "<node_internals>/**"
       ],
+      "program": "${workspaceFolder}/node_modules/.bin/tape",
+      "args": ["test/reduceRangeDuplicates.js"]
+    },
+    {
+      "type": "pwa-node",
+      "request": "launch",
+      "name": "bin/reduceRangeDuplicates fixture1",
+      "skipFiles": [
+        "<node_internals>/**"
+      ],
       "program": "${workspaceFolder}/bin/reduceRangeDuplicates.js",
-      "args": ["/tmp/input_POZXt.geojson", "/tmp/output/0OIov.geojson"]
+      "args": ["test/fixtures/reduceRangeDuplicates/input1.geojson", "test/fixtures/reduceRangeDuplicates/actualOutput1.geojson"]
     },
     {
       "type": "pwa-node",
diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js
index 6fac0b5..4ef57f9 100755
--- a/bin/reduceRangeDuplicates.js
+++ b/bin/reduceRangeDuplicates.js
@@ -41,6 +41,8 @@ if (!fs.existsSync(inputFile)) {
   process.exit(1)
 }
 
+const intermediateFile = `${outputFile}-intermediate.json`
+
 function hash(feature) {
   return [
     feature.properties['addr:housenumber'],
@@ -95,11 +97,13 @@ const index = new Transform({
 const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
 
 /*
-* second pass, filter A removes ranges where each endpoint of the range exists separately
 * eg.
 * - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
 * - 304 Cardigan Street Calton
 * - 306 Cardigan Street Calton
+*
+* Conditional on the individual addresses not sharing the same geometry, if they do then they are dropped in favour of the range
 *
 * - 249-263 Faraday Street
 * - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
@@ -200,13 +204,25 @@ const reduceRange = new Transform({
         }
       }
 
-      // can be removed, feature not pushed
-      if (argv.verbose) {
-        console.log(`Filter A: ${feature.properties['addr:housenumber']} can be removed`)
-      }
+      // if matched start and end point have the same coordinates, then to avoid overlapping points, favour range so retain it
+      if (matchedStart.geometry.coordinates.join(',') === (matchedEnd.geometry.coordinates.join(','))) {
+        if (argv.verbose) {
+          console.log(`Filter A: ${feature.properties['addr:housenumber']} ${feature.properties['addr:street']} ${feature.properties['addr:suburb']} retained because while endpoints exist they share the same geometry`)
+        }
+        this.push(feature)
+      } else {
+        // can be removed, feature not pushed
+        if (argv.verbose) {
+          console.log(`Filter A: ${feature.properties['addr:housenumber']} ${feature.properties['addr:street']} ${feature.properties['addr:suburb']} can be removed`)
+        }
+
+        // keep track of removed features for filter B, so we don't double remove both range and midpoints
+        rangesRemovedInFilterA[hash(feature)] = true
 
-      // keep track of removed features for filter B, so we don't double remove both range and midpoints
-      rangesRemovedInFilterA[hash(feature)] = true
+        if (argv.debug) {
+          debugStreams['filterA_dropRange'].write(feature)
+        }
+      }
     } else {
       // not both start and end found,
       // if one of start or end found and that start/end has addr:flats...
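To make the Filter A change above easier to follow: the script indexes every non-range address, and for a range such as "304-306" it looks up both endpoints before deciding whether the range is redundant. Below is a minimal sketch of that decision under the new rule, assuming a prebuilt lookup map and a simplified key; addressKey, indexByKey and canDropRange are illustrative names, not identifiers from bin/reduceRangeDuplicates.js.

// Illustrative sketch only: addressKey and indexByKey are assumed helpers.
function addressKey (properties, housenumber) {
  return [housenumber, properties['addr:street'], properties['addr:suburb']].join('|')
}

// Returns true when Filter A may drop a range feature (eg housenumber "304-306").
function canDropRange (feature, indexByKey) {
  const [start, end] = feature.properties['addr:housenumber'].split('-')
  const matchedStart = indexByKey.get(addressKey(feature.properties, start))
  const matchedEnd = indexByKey.get(addressKey(feature.properties, end))

  // both endpoints must exist on their own for the range to be redundant
  if (!matchedStart || !matchedEnd) return false

  // new behaviour in this commit: if the standalone endpoints sit on the exact
  // same point, keep the range instead so the output does not stack addresses
  const sameGeometry =
    matchedStart.geometry.coordinates.join(',') === matchedEnd.geometry.coordinates.join(',')
  return !sameGeometry
}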
@@ -216,6 +232,9 @@ const reduceRange = new Transform({
       (matchedStart && matchedStart.properties['addr:flats']) || (matchedEnd && matchedEnd.properties['addr:flats'])
     )) {
       // drop the range, eg "112-116 Anderson Street, South Yarra"
+      if (argv.debug) {
+        debugStreams['filterA_dropRangeRangeNoFlatsNonRangeHasFlats'].write(feature)
+      }
     } else {
       // then still include the range
       this.push(feature)
@@ -285,8 +304,17 @@ const reduceNonRange = new Transform({
             // since the non-range feature has a unit that the range doesn't have, don't drop it
             dropFeature = false
             if (argv.debug) {
-              debugStreams['addrInRangeDifferentUnits'].write(feature)
-              debugStreams['addrInRangeDifferentUnits'].write(range)
+              debugStreams.addrInRangeDifferentUnits.write(feature)
+              debugStreams.addrInRangeDifferentUnits.write(range)
+
+              debugStreams.addrInRangeDifferentUnits.write({
+                type: 'Feature',
+                properties: feature.properties,
+                geometry: {
+                  type: 'LineString',
+                  coordinates: [feature.geometry.coordinates, range.geometry.coordinates]
+                }
+              })
             }
           }
         } else {
@@ -297,12 +325,16 @@ const reduceNonRange = new Transform({
           }
         }
       }
+
       if (!dropFeature) {
         this.push(feature)
       } else {
         if (argv.verbose) {
           console.log(`Filter B: Dropping ${feature.properties['addr:housenumber']}`)
         }
+        if (argv.debug) {
+          debugStreams['filterB'].write(feature)
+        }
       }
     } else {
       this.push(feature)
@@ -313,7 +345,7 @@ const reduceNonRange = new Transform({
 })
 
 // ndjson streams to output debug features
-const debugKeys = ['addrInRangeDifferentUnits']
+const debugKeys = ['addrInRangeDifferentUnits', 'filterA_dropRangeRangeNoFlatsNonRangeHasFlats', 'filterA_dropRange', 'filterB']
 const debugStreams = {}
 const debugStreamOutputs = {}
@@ -335,39 +367,54 @@ pipeline(
       console.log(err)
       process.exit(1)
     } else {
-      // second pass to remove range duplicates
-      console.log('Pass 2/2: remove range duplicates')
+      // second pass to remove range duplicates part A
+      console.log('Pass 2/3: remove range duplicates part A ranges')
       pipeline(
         fs.createReadStream(inputFile),
         ndjson.parse(),
         reduceRange,
-        reduceNonRange,
         ndjson.stringify(),
-        fs.createWriteStream(outputFile),
+        fs.createWriteStream(intermediateFile),
         err => {
           if (err) {
             console.log(err)
             process.exit(1)
           } else {
-            if (argv.debug) {
-              debugKeys.forEach(key => {
-                debugStreams[key].end()
-              })
-
-              Promise.all(debugKeys.map(key => {
-                return new Promise(resolve => {
-                  debugStreamOutputs[key].on('finish', () => {
-                    console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`)
-                    resolve()
-                  })
-                })
-              }))
-                .then(() => {
-                  process.exit(0)
-                })
-            } else {
-              process.exit(0)
-            }
+            console.log('Pass 3/3: remove range duplicates part B endpoints')
+            pipeline(
+              fs.createReadStream(intermediateFile),
+              ndjson.parse(),
+              reduceNonRange,
+              ndjson.stringify(),
+              fs.createWriteStream(outputFile),
+              err => {
+                fs.unlinkSync(intermediateFile)
+                if (err) {
+                  console.log(err)
+                  process.exit(1)
+                } else {
+                  if (argv.debug) {
+                    debugKeys.forEach(key => {
+                      debugStreams[key].end()
+                    })
+
+                    Promise.all(debugKeys.map(key => {
+                      return new Promise(resolve => {
+                        debugStreamOutputs[key].on('finish', () => {
+                          console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`)
+                          resolve()
+                        })
+                      })
+                    }))
+                      .then(() => {
+                        process.exit(0)
+                      })
+                  } else {
+                    process.exit(0)
+                  }
+                }
+              }
+            )
           }
         }
       )
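The nested callbacks in the hunk above are easier to read flattened out. Here is a minimal sketch of the same three-pass flow (the indexing pass is omitted), assuming reduceRange and reduceNonRange are Transform streams like the script's; everything else in the sketch is illustrative rather than the repository's code.

// Sketch of the chained two-stage pipeline introduced by this change.
const fs = require('fs')
const { pipeline } = require('stream')
const ndjson = require('ndjson')

function run (inputFile, outputFile, reduceRange, reduceNonRange) {
  const intermediateFile = `${outputFile}-intermediate.json`

  console.log('Pass 2/3: remove range duplicates part A ranges')
  pipeline(
    fs.createReadStream(inputFile),
    ndjson.parse(),
    reduceRange,
    ndjson.stringify(),
    fs.createWriteStream(intermediateFile),
    err => {
      if (err) return console.error(err)

      console.log('Pass 3/3: remove range duplicates part B endpoints')
      pipeline(
        fs.createReadStream(intermediateFile),
        ndjson.parse(),
        reduceNonRange,
        ndjson.stringify(),
        fs.createWriteStream(outputFile),
        err => {
          fs.unlinkSync(intermediateFile) // clean up the temporary file either way
          if (err) console.error(err)
        }
      )
    }
  )
}

Splitting the single pipeline into two appears to be the point of the intermediate file: filter B only starts once filter A has finished, so rangesRemovedInFilterA is fully populated before it is consulted, at the cost of one extra serialisation to disk.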
diff --git a/test/fixtures/reduceRangeDuplicates/expectedOutput1.geojson b/test/fixtures/reduceRangeDuplicates/expectedOutput1.geojson
new file mode 100644
index 0000000..1e8f2c7
--- /dev/null
+++ b/test/fixtures/reduceRangeDuplicates/expectedOutput1.geojson
@@ -0,0 +1,2 @@
+{"type":"Feature","properties":{"_pfi":"50586965,50585085,425955908","addr:housenumber":"39","addr:street":"Linton Street","addr:suburb":"Balaclava","addr:state":"VIC","addr:postcode":"3183","addr:unit":null,"addr:flats":null,"addr:flats2":null,"addr:flats3":null,"addr:flats4":null,"addr:flats5":null,"addr:flats6":null,"addr:flats7":null},"geometry":{"type":"Point","coordinates":[144.9924932,-37.8669844]}}
+{"type":"Feature","properties":{"_pfi":"50585086,425955907","addr:housenumber":"41","addr:street":"Linton Street","addr:suburb":"Balaclava","addr:state":"VIC","addr:postcode":"3183","addr:unit":null,"addr:flats":null,"addr:flats2":null,"addr:flats3":null,"addr:flats4":null,"addr:flats5":null,"addr:flats6":null,"addr:flats7":null},"geometry":{"type":"Point","coordinates":[144.992463,-37.8670328]}}
diff --git a/test/fixtures/reduceRangeDuplicates/input1.geojson b/test/fixtures/reduceRangeDuplicates/input1.geojson
new file mode 100644
index 0000000..d91155d
--- /dev/null
+++ b/test/fixtures/reduceRangeDuplicates/input1.geojson
@@ -0,0 +1,3 @@
+{ "type": "Feature", "properties": { "_pfi": "50586965,50585085,425955908", "addr:housenumber": "39", "addr:street": "Linton Street", "addr:suburb": "Balaclava", "addr:state": "VIC", "addr:postcode": "3183", "addr:unit": null, "addr:flats": null, "addr:flats2": null, "addr:flats3": null, "addr:flats4": null, "addr:flats5": null, "addr:flats6": null, "addr:flats7": null }, "geometry": { "type": "Point", "coordinates": [ 144.9924932, -37.8669844 ] } }
+{ "type": "Feature", "properties": { "_pfi": "206441264", "addr:housenumber": "39-41", "addr:street": "Linton Street", "addr:suburb": "Balaclava", "addr:state": "VIC", "addr:postcode": "3183", "addr:unit": null, "addr:flats": null, "addr:flats2": null, "addr:flats3": null, "addr:flats4": null, "addr:flats5": null, "addr:flats6": null, "addr:flats7": null }, "geometry": { "type": "Point", "coordinates": [ 144.9925274, -37.8670149 ] } }
+{ "type": "Feature", "properties": { "_pfi": "50585086,425955907", "addr:housenumber": "41", "addr:street": "Linton Street", "addr:suburb": "Balaclava", "addr:state": "VIC", "addr:postcode": "3183", "addr:unit": null, "addr:flats": null, "addr:flats2": null, "addr:flats3": null, "addr:flats4": null, "addr:flats5": null, "addr:flats6": null, "addr:flats7": null }, "geometry": { "type": "Point", "coordinates": [ 144.992463, -37.8670328 ] } }
diff --git a/test/reduceRangeDuplicates.js b/test/reduceRangeDuplicates.js
index 091390c..21c80c4 100644
--- a/test/reduceRangeDuplicates.js
+++ b/test/reduceRangeDuplicates.js
@@ -3,7 +3,7 @@ const fs = require('fs')
 const child_process = require('child_process')
 const mktemp = require('mktemp')
 
-function createFeature(unit, housenumber, street, suburb, flats) {
+function createFeature(coordinates, unit, housenumber, street, suburb, flats) {
   return {
     type: 'Feature',
     properties: {
@@ -15,18 +15,21 @@ function createFeature(unit, housenumber, street, suburb, flats) {
       'addr:state': 'VIC',
       'addr:postcode': '0000'
     },
-    geometry: null
+    geometry: coordinates ? {
+      type: 'Point',
+      coordinates: coordinates
+    } : null
   }
 }
 
-test('reduceRangeDuplicates', t => {
+test('reduceRangeDuplicates distinct geometries', t => {
   const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
   const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
   const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
 
-  const AB = createFeature(null, '304-306', 'Cardigan Street', 'Carlton')
-  const A = createFeature(null, '304', 'Cardigan Street', 'Carlton')
-  const B = createFeature(null, '306', 'Cardigan Street', 'Carlton')
+  const AB = createFeature([0, 0], null, '304-306', 'Cardigan Street', 'Carlton')
+  const A = createFeature([-1, 0], null, '304', 'Cardigan Street', 'Carlton')
+  const B = createFeature([1, 0], null, '306', 'Cardigan Street', 'Carlton')
 
   // all three features to appear in input
   fs.appendFileSync(inputFile, JSON.stringify(AB) + '\n')
@@ -37,11 +40,11 @@ test('reduceRangeDuplicates', t => {
   fs.appendFileSync(expectedFile, JSON.stringify(A) + '\n')
   fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n')
 
-  child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
 
   t.same(
-    fs.readFileSync(outputFile),
-    fs.readFileSync(expectedFile),
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
     'range with endpoints appearing separately, drops range'
   )
 
   fs.unlinkSync(inputFile)
   fs.unlinkSync(outputFile)
   fs.unlinkSync(expectedFile)
@@ -52,13 +55,45 @@ test('reduceRangeDuplicates', t => {
   t.end()
 })
 
+test('reduceRangeDuplicates overlapping geometries', t => {
+  const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
+  const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
+  const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
+
+  const AB = createFeature([0, 0], null, '304-306', 'Cardigan Street', 'Carlton')
+  const A = createFeature([0, 0], null, '304', 'Cardigan Street', 'Carlton')
+  const B = createFeature([0, 0], null, '306', 'Cardigan Street', 'Carlton')
+
+  // all three features to appear in input
+  fs.appendFileSync(inputFile, JSON.stringify(AB) + '\n')
+  fs.appendFileSync(inputFile, JSON.stringify(A) + '\n')
+  fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
+
+  // output expected to drop the endpoints and retain the range since endpoints are overlapping
+  fs.appendFileSync(expectedFile, JSON.stringify(AB) + '\n')
+
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
+
+  t.same(
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    'range with endpoints appearing separately but overlapping, drops the endpoints'
+  )
+
+  fs.unlinkSync(inputFile)
+  fs.unlinkSync(outputFile)
+  fs.unlinkSync(expectedFile)
+
+  t.end()
+})
+
 test('reduceRangeDuplicates', t => {
   const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
   const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
   const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
 
-  const AC = createFeature(null, '249-263', 'Faraday Street', 'Carlton')
-  const B = createFeature(null, '251', 'Faraday Street', 'Carlton')
+  const AC = createFeature(null, null, '249-263', 'Faraday Street', 'Carlton')
+  const B = createFeature(null, null, '251', 'Faraday Street', 'Carlton')
 
   // both features to appear in input
   fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n')
@@ -67,11 +102,11 @@ test('reduceRangeDuplicates', t => {
   fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
 
   // output expected to just be range, dropping the midpoint
   fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n')
 
-  child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
 
   t.same(
-    fs.readFileSync(outputFile),
-    fs.readFileSync(expectedFile),
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
     'range with lone midpoint, drops midpoint'
   )
@@ -87,8 +122,8 @@ test('reduceRangeDuplicates', t => {
   const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
   const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
 
-  const AC = createFeature(null, '249-263', 'Faraday Street', 'Carlton')
-  const B = createFeature('1', '251', 'Faraday Street', 'Carlton')
+  const AC = createFeature(null, null, '249-263', 'Faraday Street', 'Carlton')
+  const B = createFeature(null, '1', '251', 'Faraday Street', 'Carlton')
 
   // both features to appear in input
   fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n')
@@ -98,11 +133,11 @@ test('reduceRangeDuplicates', t => {
   fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n')
   fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n')
 
-  child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
 
   t.same(
-    fs.readFileSync(outputFile),
-    fs.readFileSync(expectedFile),
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
     'midpoint with unit not dropped'
   )
@@ -118,22 +153,22 @@ test('reduceRangeDuplicates', t => {
   const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
   const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
 
-  const AC = createFeature(null, '249-263', 'Faraday Street', 'Carlton')
-  const B = createFeature(null, '251', 'Faraday Street', 'Carlton', '1;2;3')
+  const AC = createFeature(null, null, '249-263', 'Faraday Street', 'Carlton')
+  const B = createFeature(null, null, '251', 'Faraday Street', 'Carlton', '1;2;3')
 
   // both features to appear in input
   fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n')
   fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
 
-  // output expected to both features because dropping the midpoint would loose the unit
+  // output expected to both features because dropping the midpoint would loose the flats
   fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n')
   fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n')
 
-  child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`)
+  child_process.execSync(`./bin/reduceRangeDuplicates.js --verbose ${inputFile} ${outputFile}`)
 
   t.same(
-    fs.readFileSync(outputFile),
-    fs.readFileSync(expectedFile),
+    fs.readFileSync(outputFile, 'utf-8').trim().split('\n').map(JSON.parse),
+    fs.readFileSync(expectedFile, 'utf-8').trim().split('\n').map(JSON.parse),
     'midpoint with flats not dropped'
   )
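A note on the assertion change that runs through all of these tests: comparing the raw Buffers returned by readFileSync reports failures as opaque byte diffs, whereas parsing each NDJSON line first compares the features structurally. A minimal, self-contained sketch of that pattern with tape; the file paths and test name here are placeholders, not part of the test suite:

// Sketch of the NDJSON comparison pattern used by the updated assertions.
const test = require('tape')
const fs = require('fs')

// parse a newline-delimited GeoJSON file into an array of feature objects
function readFeatures (path) {
  return fs.readFileSync(path, 'utf-8').trim().split('\n').map(JSON.parse)
}

test('output matches expected features', t => {
  t.same(
    readFeatures('/tmp/output.geojson'),   // placeholder paths
    readFeatures('/tmp/expected.geojson'),
    'features match after parsing each NDJSON line'
  )
  t.end()
})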