From 43d2090893aeeb8645e0e9c6ca05d5f96e48aae0 Mon Sep 17 00:00:00 2001
From: Andrew Harvey
Date: Wed, 5 May 2021 15:21:51 +1000
Subject: clean up debug logging, corner cases, and set addr:unit:prefix

---
 bin/reduceDuplicates.js | 91 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 72 insertions(+), 19 deletions(-)

(limited to 'bin/reduceDuplicates.js')

diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js
index 542b43f..abd5810 100755
--- a/bin/reduceDuplicates.js
+++ b/bin/reduceDuplicates.js
@@ -8,6 +8,7 @@ const fs = require('fs')
 const { Readable, Transform, pipeline } = require('stream')
 const ndjson = require('ndjson')
 const cluster = require('../lib/cluster.js')
+const cloneDeep = require('clone-deep')
 
 const argv = require('yargs/yargs')(process.argv.slice(2))
   .option('debug', {
@@ -32,6 +33,7 @@ if (!fs.existsSync(inputFile)) {
 let sourceCount = 0
 const features = {}
 
+// index features by properties
 const index = new Transform({
   readableObjectMode: true,
   writableObjectMode: true,
@@ -43,6 +45,7 @@ const index = new Transform({
     }
 
     const key = [
+      feature.properties['addr:unit:prefix'],
       feature.properties['addr:unit'],
       feature.properties['addr:housenumber'],
       feature.properties['addr:street'],
@@ -60,6 +63,7 @@ const index = new Transform({
   }
 })
 
+// remove duplicates
 let reduceIndex = 0
 const reduce = new Transform({
   readableObjectMode: true,
@@ -80,24 +84,55 @@ const reduce = new Transform({
       const sameCoordinates = [...new Set(groupedFeatures.map(f => f.geometry.coordinates.join(',')))].length <= 1
       if (sameCoordinates) {
-        // features have same properties and same geometry, so true duplicates can reduce to one
+        // features have same properties and same geometry, so they are true duplicates which can safely be reduced to one
         this.push(groupedFeatures[0])
       } else {
+        // features have same properties but not all with the same geometry
+
+        // cluster features with a threshold of 25m
         const clusters = cluster(groupedFeatures, 25)
 
         // if clustered into a single cluster, then output a single average feature
+        // this should be safe to use as within 25m
         if (clusters.length === 1) {
           const averageCoordinates = [
             groupedFeatures.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / groupedFeatures.length,
             groupedFeatures.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / groupedFeatures.length
           ]
-          const averageFeature = groupedFeatures[0]
+          const averageFeature = cloneDeep(groupedFeatures[0])
           averageFeature.geometry.coordinates = averageCoordinates
 
+          if (argv.debug) {
+            // create a spider web to illustrate which features were clustered together and where the average point is
+            const spiderWebCoordinates = []
+
+            debugStreams.singleCluster.write(averageFeature)
+            groupedFeatures.forEach(feature => {
+              // debugStreams.singleCluster.write(feature)
+
+              // start with the average point
+              spiderWebCoordinates.push(averageFeature.geometry.coordinates)
+              // go out to the source point
+              spiderWebCoordinates.push(feature.geometry.coordinates)
+              // end back at the average point
+              spiderWebCoordinates.push(averageFeature.geometry.coordinates)
+            })
+
+            // output a web connecting the source points for visualisation
+            debugStreams.singleCluster.write({
+              type: 'Feature',
+              properties: Object.assign({ '_type': 'Single Cluster' }, averageFeature.properties),
+              geometry: {
+                type: 'LineString',
+                coordinates: spiderWebCoordinates
+              }
+            })
+          }
+
           this.push(averageFeature)
         } else {
-          // more than one cluster, reduce those clustered into one, and then report all the results
+          // more than one cluster, reduce those clustered into centroids, and then report all the centroids
+          // these will need to be manually reviewed
           const clusterAverages = clusters.map(cluster => {
             if (cluster.length === 1) {
               return cluster[0]
@@ -106,23 +141,28 @@ const reduce = new Transform({
                 cluster.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / cluster.length,
                 cluster.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / cluster.length
               ]
-              const averageFeature = cluster[0]
+              const averageFeature = cloneDeep(cluster[0])
               averageFeature.geometry.coordinates = averageCoordinates
               return averageFeature
             }
           })
 
-          // report these as address points with the same attributes but different locations beyond the threshold
-          if (debugDuplicateAddressStream) {
+          // report these as address points with the same attributes but different locations beyond the cluster threshold
+          if (argv.debug) {
             const webOfMatches = {
               type: 'Feature',
-              properties: clusterAverages[0].properties,
+              properties: Object.assign({ '_type': 'Multi Cluster' }, clusterAverages[0].properties),
               geometry: {
                 type: 'LineString',
                 coordinates: clusterAverages.map(p => p.geometry.coordinates)
               }
             }
-            debugDuplicateAddressStream.write(webOfMatches)
+            clusterAverages.forEach(feature => {
+              // output candidate feature
+              debugStreams.multiCluster.write(feature)
+            })
+            // output a web connecting the candidates for visualisation
+            debugStreams.multiCluster.write(webOfMatches)
           }
         }
       }
@@ -132,11 +172,16 @@ const reduce = new Transform({
   }
 })
 
-const debugDuplicateAddressStream = argv.debug ? ndjson.stringify() : null
+// ndjson streams to output debug features
+const debugKeys = ['singleCluster', 'multiCluster']
+const debugStreams = {}
+const debugStreamOutputs = {}
 
-let debugApplicationsAddressStreamOutput
-if (debugDuplicateAddressStream) {
-  debugApplicationsAddressStreamOutput = debugDuplicateAddressStream.pipe(fs.createWriteStream('debug/reduceDuplicates/duplicateAddresses.geojson'))
+if (argv.debug) {
+  debugKeys.forEach(key => {
+    debugStreams[key] = ndjson.stringify()
+    debugStreamOutputs[key] = debugStreams[key].pipe(fs.createWriteStream(`debug/reduceDuplicates/${key}.geojson`))
+  })
 }
 
 // first pass to index by geometry
@@ -162,14 +207,22 @@ pipeline(
       console.log(err)
       process.exit(1)
     } else {
-      if (debugDuplicateAddressStream) {
-        debugDuplicateAddressStream.end()
-      }
-      if (debugApplicationsAddressStreamOutput) {
-        debugApplicationsAddressStreamOutput.on('finish', () => {
-          console.log('saved debug/reduceDuplicates/duplicateAddresses.geojson')
-          process.exit(0)
+      if (argv.debug) {
+        debugKeys.forEach(key => {
+          debugStreams[key].end()
         })
+
+        Promise.all(debugKeys.map(key => {
+          return new Promise(resolve => {
+            debugStreamOutputs[key].on('finish', () => {
+              console.log(`saved debug/reduceDuplicates/${key}.geojson`)
+              resolve()
+            })
+          })
+        }))
+        .then(() => {
+          process.exit(0)
+        })
       } else {
         process.exit(0)
       }
--
cgit v1.2.3
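
For context, a minimal standalone sketch of the reduction step this patch changes, assuming GeoJSON Point features and the same clone-deep package the patch requires; averageCluster and buildSpiderWeb are hypothetical helper names used only for illustration, not functions from the patched file.

const cloneDeep = require('clone-deep')

// reduce a cluster of duplicate address points to one feature located at the
// average of their coordinates, cloning first so the source feature is not mutated
function averageCluster (features) {
  const averageCoordinates = [
    features.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / features.length,
    features.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / features.length
  ]
  const averageFeature = cloneDeep(features[0])
  averageFeature.geometry.coordinates = averageCoordinates
  return averageFeature
}

// build the debug "spider web" LineString: average point -> source point -> average point
// for each source feature, so clustered duplicates can be inspected visually
function buildSpiderWeb (averageFeature, features) {
  const coordinates = []
  features.forEach(feature => {
    coordinates.push(averageFeature.geometry.coordinates)
    coordinates.push(feature.geometry.coordinates)
    coordinates.push(averageFeature.geometry.coordinates)
  })
  return {
    type: 'Feature',
    properties: Object.assign({ _type: 'Single Cluster' }, averageFeature.properties),
    geometry: {
      type: 'LineString',
      coordinates: coordinates
    }
  }
}

Under those assumptions, the single-cluster branch above amounts to pushing averageCluster(groupedFeatures) downstream and, in debug mode, writing the corresponding buildSpiderWeb feature to the singleCluster stream.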