-rw-r--r--   Makefile                |   4
-rw-r--r--   README.md               |   2
-rwxr-xr-x   bin/reduceDuplicates.js | 169
3 files changed, 123 insertions(+), 52 deletions(-)
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -49,9 +49,9 @@ dist/vicmap-osm-with-suburb.geojson: data/vicmap.geojson
 dist/vicmap-osm.mbtiles: dist/vicmap-osm.geojson
 	tippecanoe --force -o $@ --minimum-zoom=12 --maximum-zoom=12 --no-feature-limit --no-tile-size-limit --no-tile-stats --read-parallel $<
 
-dist/vicmap-osm-uniq.geojson: dist/vicmap-osm-with-suburb.geojson
+dist/vicmap-osm-uniq.geojson: dist/vicmap-osm-with-suburb.geojson data/victoria-addr.osm.geojson
 	mkdir -p debug/reduceDuplicates
-	node --max_old_space_size=4096 ./bin/reduceDuplicates.js --debug $< $@
+	node --max_old_space_size=4096 ./bin/reduceDuplicates.js --debug $^ $@
 
 dist/vicmap-osm-uniq-flats.geojson: dist/vicmap-osm-uniq.geojson
 	mkdir -p debug/reduceOverlap

diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Two debug outputs are produced from this step.
 
 1. _singleCluster_ - visualises where all addresses with the same address properties are combined into a single "cluster" based on a 25 meter maximum threshold distance. In this case it's safe to reduce all the points into a single centroid point.
-2. _multiCluster_ - visualises where all addresses with the same address properties exceed the 25 meter cluster threshold and are unable to be reduced to a single point. These are not included in the import and need to be reviewed for manual import. A MapRoulette challenge is outputted at `debug/reduceDuplicates/mr_duplicateAddressFarApart.geojson`, however because this is before the conflation stage, many of these may already exist in OSM. It's a TODO for these to be conflated so that only missing from OSM addresses are asked to be checked in MapRoulette.
+2. _multiCluster_ - visualises where all addresses with the same address properties exceed the 25 meter cluster threshold and are unable to be reduced to a single point. These are not included in the import and need to be reviewed for manual import. A MapRoulette challenge is outputted at `debug/reduceDuplicates/mr_duplicateAddressFarApart.geojson`; a rough conflation pass filters this down to only those addresses not already present in OSM.
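A note on the Makefile change: in a rule's recipe, `$<` expands to the first prerequisite only, while `$^` expands to all prerequisites. With `data/victoria-addr.osm.geojson` added as a second prerequisite, the recipe now effectively runs `node --max_old_space_size=4096 ./bin/reduceDuplicates.js --debug dist/vicmap-osm-with-suburb.geojson data/victoria-addr.osm.geojson dist/vicmap-osm-uniq.geojson`, matching the script's new three-argument usage below.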
diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js
index 0abef54..3c5a7ee 100755
--- a/bin/reduceDuplicates.js
+++ b/bin/reduceDuplicates.js
@@ -11,6 +11,8 @@ const cluster = require('../lib/cluster.js')
 const cloneDeep = require('clone-deep')
 const xml = require('xml-js')
 const _ = require('lodash')
+const { default: centroid } = require('@turf/centroid')
+const { default: distance } = require('@turf/distance')
 
 const argv = require('yargs/yargs')(process.argv.slice(2))
   .option('debug', {
@@ -20,18 +22,52 @@ const argv = require('yargs/yargs')(process.argv.slice(2))
   .argv
 
 if (argv._.length < 2) {
-  console.error("Usage: ./reduceDuplicates.js input.geojson output.geojson")
+  console.error("Usage: ./reduceDuplicates.js input.geojson osmFile.geojson output.geojson")
   process.exit(1)
 }
 
 const inputFile = argv._[0]
-const outputFile = argv._[1]
+const osmFile = argv._[1]
+const outputFile = argv._[2]
 
 if (!fs.existsSync(inputFile)) {
   console.error(`${inputFile} not found`)
   process.exit(1)
 }
 
+if (!fs.existsSync(osmFile)) {
+  console.error(`${osmFile} not found`)
+  process.exit(1)
+}
+
+const osmAddressKeys = {}
+
+let osmAddrCount = 0
+const indexOSM = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    osmAddrCount++
+
+    if (process.stdout.isTTY && osmAddrCount % 10000 === 0) {
+      process.stdout.write(` ${osmAddrCount.toLocaleString()}\r`)
+    }
+
+    if (feature && feature.properties) {
+      const key = [
+        feature.properties['addr:housenumber'],
+        feature.properties['addr:street']
+      ].join('|')
+      if (!(key in osmAddressKeys)) {
+        osmAddressKeys[key] = []
+      }
+      osmAddressKeys[key].push(centroid(feature))
+    }
+
+    callback()
+  }
+})
+
 let sourceCount = 0
 const features = {}
 
@@ -182,25 +218,47 @@ const reduce = new Transform({
           debugStreams.multiCluster.write(webOfMatches)
 
           // output as a MapRoulette task
-          const task = {
-            type: 'FeatureCollection',
-            features: [
-              ...groupedFeatures
-            ],
-            cooperativeWork: {
-              meta: {
-                version: 2,
-                type: 2
-              },
-              file: {
-                type: 'xml',
-                format: 'osc',
-                encoding: 'base64',
-                content: Buffer.from(featureToOsc(groupedFeatures[0])).toString('base64') // the base64-encoded osc file
-              }
-            }
-          }
-          debugStreams.mr_duplicateAddressFarApart.write(task)
+          const firstGroupedFeature = groupedFeatures[0]
+          const firstGroupedFeatureKey = [
+            firstGroupedFeature.properties['addr:housenumber'],
+            firstGroupedFeature.properties['addr:street']
+          ].join('|')
+
+          let foundInOSM = false
+          if (firstGroupedFeatureKey in osmAddressKeys) {
+            // a potential match exists in OSM, check how far away the nearest one is
+            const closestDistance = osmAddressKeys[firstGroupedFeatureKey].map(osm => {
+              return distance(osm, centroid(firstGroupedFeature))
+            })
+              .sort((a, b) => b - a)
+              .pop()
+
+            if (closestDistance < 50) {
+              foundInOSM = true
+            }
+          }
+          if (!foundInOSM) {
+            // not found in OSM, output as a MapRoulette task
+            const task = {
+              type: 'FeatureCollection',
+              features: [
+                ...groupedFeatures
+              ],
+              cooperativeWork: {
+                meta: {
+                  version: 2,
+                  type: 2
+                },
+                file: {
+                  type: 'xml',
+                  format: 'osc',
+                  encoding: 'base64',
+                  content: Buffer.from(featureToOsc(groupedFeatures[0])).toString('base64') // the base64-encoded osc file
+                }
+              }
+            }
+            debugStreams.mr_duplicateAddressFarApart.write(task)
+          }
         }
       }
     }
@@ -267,52 +325,65 @@ if (argv.debug) {
   })
 }
 
-// first pass to index by geometry
-console.log('Pass 1/2: index by address properties')
+// first pass to index existing OSM addresses
+console.log('Pass 1/3: Store existing OSM addresses')
 pipeline(
-  fs.createReadStream(inputFile),
+  fs.createReadStream(osmFile),
   ndjson.parse(),
-  index,
+  indexOSM,
   err => {
     if (err) {
       console.log(err)
       process.exit(1)
     } else {
-      console.log(` of ${sourceCount.toLocaleString()} features found ${Object.keys(features).length.toLocaleString()} unique addresses`)
-      // second pass to reduce duplicate features
-      console.log('Pass 2/2: reduce duplicate features')
+      // second pass to index by geometry
+      console.log('Pass 2/3: index by address properties')
       pipeline(
-        Readable.from(Object.keys(features)),
-        reduce,
-        ndjson.stringify(),
-        fs.createWriteStream(outputFile),
+        fs.createReadStream(inputFile),
+        ndjson.parse(),
+        index,
         err => {
           if (err) {
             console.log(err)
             process.exit(1)
           } else {
-            if (argv.debug) {
-              debugKeys.forEach(key => {
-                debugStreams[key].end()
-              })
+            console.log(` of ${sourceCount.toLocaleString()} features found ${Object.keys(features).length.toLocaleString()} unique addresses`)
+            // third pass to reduce duplicate features
+            console.log('Pass 3/3: reduce duplicate features')
+            pipeline(
+              Readable.from(Object.keys(features)),
+              reduce,
+              ndjson.stringify(),
+              fs.createWriteStream(outputFile),
+              err => {
+                if (err) {
+                  console.log(err)
+                  process.exit(1)
+                } else {
+                  if (argv.debug) {
+                    debugKeys.forEach(key => {
+                      debugStreams[key].end()
+                    })
 
-              Promise.all(debugKeys.map(key => {
-                return new Promise(resolve => {
-                  debugStreamOutputs[key].on('finish', () => {
-                    console.log(`saved debug/reduceDuplicates/${key}.geojson`)
-                    resolve()
-                  })
-                })
-              }))
-              .then(() => {
-                process.exit(0)
-              })
-            } else {
-              process.exit(0)
-            }
+                    Promise.all(debugKeys.map(key => {
+                      return new Promise(resolve => {
+                        debugStreamOutputs[key].on('finish', () => {
+                          console.log(`saved debug/reduceDuplicates/${key}.geojson`)
+                          resolve()
+                        })
+                      })
+                    }))
+                    .then(() => {
+                      process.exit(0)
+                    })
+                  } else {
+                    process.exit(0)
+                  }
+                }
+              }
+            )
           }
         }
       )
     }
-  }
-)
+  })
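The heart of the patch is the new conflation check inside `reduce`: cluster groups whose `addr:housenumber` + `addr:street` key already appears in the OSM index, with a nearby match, are no longer emitted as MapRoulette tasks. The sketch below is a hypothetical standalone distillation of that predicate; the `alreadyInOSM` name and `thresholdMeters` parameter are illustrative, not from the patch, and it assumes the same `@turf/centroid` and `@turf/distance` packages the patch imports. Two details worth double-checking in the patch itself: `@turf/distance` returns kilometers by default, so `closestDistance < 50` as written is a 50 km cutoff rather than 50 m (pass `{ units: 'meters' }` if meters are intended), and the usage guard still tests `argv._.length < 2` even though three positional arguments are now required.

```js
// Hypothetical distillation of the patch's duplicate-suppression check;
// alreadyInOSM and thresholdMeters are illustrative names, not from the patch.
const { default: centroid } = require('@turf/centroid')
const { default: distance } = require('@turf/distance')

// osmAddressKeys maps 'housenumber|street' to an array of OSM address
// centroids, as built by the indexOSM transform in pass 1
function alreadyInOSM (osmAddressKeys, feature, thresholdMeters = 50) {
  const key = [
    feature.properties['addr:housenumber'],
    feature.properties['addr:street']
  ].join('|')

  if (!(key in osmAddressKeys)) return false

  // distance from this candidate to the nearest OSM address sharing the key;
  // @turf/distance defaults to kilometers, so request meters explicitly
  const closest = Math.min(...osmAddressKeys[key].map(osm =>
    distance(osm, centroid(feature), { units: 'meters' })
  ))

  return closest < thresholdMeters
}
```

`Math.min` here is equivalent to the patch's descending sort followed by `pop`; both select the smallest pairwise distance.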
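A side effect of adding a third pass is that the callback-style `pipeline` calls now nest three levels deep. Purely as a style observation, the same flow can be flattened with the promise-based `pipeline` from Node's `stream/promises` module (available since Node 15). A minimal sketch, reusing the script's existing `indexOSM`, `index`, and `reduce` transforms and file-path variables:

```js
const fs = require('fs')
const { Readable } = require('stream')
const { pipeline } = require('stream/promises')
const ndjson = require('ndjson')

async function run () {
  // pass 1: index existing OSM addresses by 'housenumber|street'
  console.log('Pass 1/3: Store existing OSM addresses')
  await pipeline(fs.createReadStream(osmFile), ndjson.parse(), indexOSM)

  // pass 2: index the source features by address properties
  console.log('Pass 2/3: index by address properties')
  await pipeline(fs.createReadStream(inputFile), ndjson.parse(), index)

  // pass 3: reduce duplicate features and write the output
  console.log('Pass 3/3: reduce duplicate features')
  await pipeline(
    Readable.from(Object.keys(features)),
    reduce,
    ndjson.stringify(),
    fs.createWriteStream(outputFile)
  )
}

run().catch(err => {
  console.log(err)
  process.exit(1)
})
```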