aboutsummaryrefslogtreecommitdiff
path: root/bin/reduceDuplicates.js
diff options
context:
space:
mode:
Diffstat (limited to 'bin/reduceDuplicates.js')
-rwxr-xr-xbin/reduceDuplicates.js91
1 files changed, 72 insertions, 19 deletions
diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js
index 542b43f..abd5810 100755
--- a/bin/reduceDuplicates.js
+++ b/bin/reduceDuplicates.js
@@ -8,6 +8,7 @@ const fs = require('fs')
const { Readable, Transform, pipeline } = require('stream')
const ndjson = require('ndjson')
const cluster = require('../lib/cluster.js')
+const cloneDeep = require('clone-deep')
const argv = require('yargs/yargs')(process.argv.slice(2))
.option('debug', {
@@ -32,6 +33,7 @@ if (!fs.existsSync(inputFile)) {
let sourceCount = 0
const features = {}
+// index features by properties
const index = new Transform({
readableObjectMode: true,
writableObjectMode: true,
@@ -43,6 +45,7 @@ const index = new Transform({
}
const key = [
+ feature.properties['addr:unit:prefix'],
feature.properties['addr:unit'],
feature.properties['addr:housenumber'],
feature.properties['addr:street'],
@@ -60,6 +63,7 @@ const index = new Transform({
}
})
+// remove duplicates
let reduceIndex = 0
const reduce = new Transform({
readableObjectMode: true,
@@ -80,24 +84,55 @@ const reduce = new Transform({
const sameCoordinates = [...new Set(groupedFeatures.map(f => f.geometry.coordinates.join(',')))].length <= 1
if (sameCoordinates) {
- // features have same properties and same geometry, so true duplicates can reduce to one
+ // features have same properties and same geometry, so they are true duplicates which can safely be reduced to one
this.push(groupedFeatures[0])
} else {
+ // features have same properties but not all with the same geometry
+
// cluster features with a threshold of 25m
const clusters = cluster(groupedFeatures, 25)
// if clustered into a single cluster, then output a single average feature
+ // this should be safe to use as the features all fell into one cluster at the 25m threshold
if (clusters.length === 1) {
const averageCoordinates = [
groupedFeatures.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / groupedFeatures.length,
groupedFeatures.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / groupedFeatures.length
]
- const averageFeature = groupedFeatures[0]
+ const averageFeature = cloneDeep(groupedFeatures[0])
averageFeature.geometry.coordinates = averageCoordinates
+ if (argv.debug) {
+ // create a spider web to illustrate which features were clustered together and where the average point is
+ const spiderWebCoordinates = []
+
+ debugStreams.singleCluster.write(averageFeature)
+ groupedFeatures.forEach(feature => {
+ // debugStreams.singleCluster.write(feature)
+
+ // start with the average point
+ spiderWebCoordinates.push(averageFeature.geometry.coordinates)
+ // go out to the source point
+ spiderWebCoordinates.push(feature.geometry.coordinates)
+ // end back at the average point
+ spiderWebCoordinates.push(averageFeature.geometry.coordinates)
+ })
+
+ // output a web connecting the source points for visualisation
+ debugStreams.singleCluster.write({
+ type: 'Feature',
+ properties: Object.assign({ '_type': 'Single Cluster' }, averageFeature.properties),
+ geometry: {
+ type: 'LineString',
+ coordinates: spiderWebCoordinates
+ }
+ })
+ }
+
this.push(averageFeature)
} else {
- // more than one cluster, reduce those clustered into one, and then report all the results
+ // more than one cluster, reduce those clustered into centroids, and then report all the centroids
+ // these will need to be manually reviewed
const clusterAverages = clusters.map(cluster => {
if (cluster.length === 1) {
return cluster[0]
@@ -106,23 +141,28 @@ const reduce = new Transform({
cluster.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / cluster.length,
cluster.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / cluster.length
]
- const averageFeature = cluster[0]
+ const averageFeature = cloneDeep(cluster[0])
averageFeature.geometry.coordinates = averageCoordinates
return averageFeature
}
})
- // report these as address points with the same attributes but different locations beyond the threshold
- if (debugDuplicateAddressStream) {
+ // report these as address points with the same attributes but different locations beyond the cluster threshold
+ if (argv.debug) {
const webOfMatches = {
type: 'Feature',
- properties: clusterAverages[0].properties,
+ properties: Object.assign({ '_type': 'Multi Cluster' }, clusterAverages[0].properties),
geometry: {
type: 'LineString',
coordinates: clusterAverages.map(p => p.geometry.coordinates)
}
}
- debugDuplicateAddressStream.write(webOfMatches)
+ clusterAverages.forEach(feature => {
+ // output candidate feature
+ debugStreams.multiCluster.write(feature)
+ })
+ // output a web connecting the candidates for visualisation
+ debugStreams.multiCluster.write(webOfMatches)
}
}
}
@@ -132,11 +172,16 @@ const reduce = new Transform({
}
})
-const debugDuplicateAddressStream = argv.debug ? ndjson.stringify() : null
+// ndjson streams to output debug features
+const debugKeys = ['singleCluster', 'multiCluster']
+const debugStreams = {}
+const debugStreamOutputs = {}
-let debugApplicationsAddressStreamOutput
-if (debugDuplicateAddressStream) {
- debugApplicationsAddressStreamOutput = debugDuplicateAddressStream.pipe(fs.createWriteStream('debug/reduceDuplicates/duplicateAddresses.geojson'))
+if (argv.debug) {
+ debugKeys.forEach(key => {
+ debugStreams[key] = ndjson.stringify()
+ debugStreamOutputs[key] = debugStreams[key].pipe(fs.createWriteStream(`debug/reduceDuplicates/${key}.geojson`))
+ })
}
// first pass to index by geometry
@@ -162,14 +207,22 @@ pipeline(
console.log(err)
process.exit(1)
} else {
- if (debugDuplicateAddressStream) {
- debugDuplicateAddressStream.end()
- }
- if (debugApplicationsAddressStreamOutput) {
- debugApplicationsAddressStreamOutput.on('finish', () => {
- console.log('saved debug/reduceDuplicates/duplicateAddresses.geojson')
- process.exit(0)
+ if (argv.debug) {
+ debugKeys.forEach(key => {
+ debugStreams[key].end()
})
+
+ Promise.all(debugKeys.map(key => {
+ return new Promise(resolve => {
+ debugStreamOutputs[key].on('finish', () => {
+ console.log(`saved debug/reduceDuplicates/${key}.geojson`)
+ resolve()
+ })
+ })
+ }))
+ .then(() => {
+ process.exit(0)
+ })
} else {
process.exit(0)
}