-rw-r--r--   Makefile                    4
-rw-r--r--   README.md                   2
-rwxr-xr-x   bin/reduceDuplicates.js   169
3 files changed, 123 insertions, 52 deletions
diff --git a/Makefile b/Makefile
index 0192367..d8ffc67 100644
--- a/Makefile
+++ b/Makefile
@@ -49,9 +49,9 @@ dist/vicmap-osm-with-suburb.geojson: data/vicmap.geojson
dist/vicmap-osm.mbtiles: dist/vicmap-osm.geojson
tippecanoe --force -o $@ --minimum-zoom=12 --maximum-zoom=12 --no-feature-limit --no-tile-size-limit --no-tile-stats --read-parallel $<
-dist/vicmap-osm-uniq.geojson: dist/vicmap-osm-with-suburb.geojson
+dist/vicmap-osm-uniq.geojson: dist/vicmap-osm-with-suburb.geojson data/victoria-addr.osm.geojson
mkdir -p debug/reduceDuplicates
- node --max_old_space_size=4096 ./bin/reduceDuplicates.js --debug $< $@
+ node --max_old_space_size=4096 ./bin/reduceDuplicates.js --debug $^ $@
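+# $^ expands to all prerequisites in order (the vicmap input, then the OSM
+# extract), where $< only named the first; $@ is the target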
dist/vicmap-osm-uniq-flats.geojson: dist/vicmap-osm-uniq.geojson
mkdir -p debug/reduceOverlap
diff --git a/README.md b/README.md
index 1ef5e39..0b37c1f 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Two debug outputs are produced from this step.
1. _singleCluster_ - visualises where all addresses with the same address properties are combined into a single "cluster" based on a 25 meter maximum threshold distance. In this case it's safe to reduce all the points into a single centroid point.
-2. _multiCluster_ - visualises where all addresses with the same address properties exceed the 25 meter cluster threshold and are unable to be reduced to a single point. These are not included in the import and need to be reviewed for manual import. A MapRoulette challenge is outputted at `debug/reduceDuplicates/mr_duplicateAddressFarApart.geojson`, however because this is before the conflation stage, many of these may already exist in OSM. It's a TODO for these to be conflated so that only missing from OSM addresses are asked to be checked in MapRoulette.
+2. _multiCluster_ - visualises where addresses with the same address properties exceed the 25 meter cluster threshold and so can't be reduced to a single point. These are not included in the import and need to be reviewed for manual import. A MapRoulette challenge is output at `debug/reduceDuplicates/mr_duplicateAddressFarApart.geojson`; a rough conflation pass limits it to addresses not already found in OSM.
![multiCluster example](img/reduceDuplicates_multiCluster.png)
diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js
index 0abef54..3c5a7ee 100755
--- a/bin/reduceDuplicates.js
+++ b/bin/reduceDuplicates.js
@@ -11,6 +11,8 @@ const cluster = require('../lib/cluster.js')
const cloneDeep = require('clone-deep')
const xml = require('xml-js')
const _ = require('lodash')
+const { default: centroid } = require('@turf/centroid')
+const { default: distance } = require('@turf/distance')
const argv = require('yargs/yargs')(process.argv.slice(2))
.option('debug', {
@@ -20,18 +22,52 @@ const argv = require('yargs/yargs')(process.argv.slice(2))
.argv
if (argv._.length < 2) {
- console.error("Usage: ./reduceDuplicates.js input.geojson output.geojson")
+ console.error("Usage: ./reduceDuplicates.js input.geojson osmFile.geojson output.geojson")
process.exit(1)
}
const inputFile = argv._[0]
-const outputFile = argv._[1]
+const osmFile = argv._[1]
+const outputFile = argv._[2]
if (!fs.existsSync(inputFile)) {
console.error(`${inputFile} not found`)
process.exit(1)
}
+if (!fs.existsSync(osmFile)) {
+ console.error(`${osmFile} not found`)
+ process.exit(1)
+}
+
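+// index of existing OSM addresses: 'housenumber|street' key -> centroids of matching features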
+const osmAddressKeys = {}
+
+let osmAddrCount = 0
+const indexOSM = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(feature, encoding, callback) {
+ osmAddrCount++
+
+ if (process.stdout.isTTY && osmAddrCount % 10000 === 0) {
+ process.stdout.write(` ${osmAddrCount.toLocaleString()}\r`)
+ }
+
+ if (feature && feature.properties) {
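+      // key on housenumber and street only; suburb is not part of the key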
+ const key = [
+ feature.properties['addr:housenumber'],
+ feature.properties['addr:street']
+ ].join('|')
+ if (!(key in osmAddressKeys)) {
+ osmAddressKeys[key] = []
+ }
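+      // store a centroid so ways and relations reduce to a single point for distance checks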
+ osmAddressKeys[key].push(centroid(feature))
+ }
+
+ callback()
+ }
+})
+
let sourceCount = 0
const features = {}
@@ -182,25 +218,47 @@ const reduce = new Transform({
debugStreams.multiCluster.write(webOfMatches)
// output as a MapRoulette task
- const task = {
- type: 'FeatureCollection',
- features: [
- ...groupedFeatures
- ],
- cooperativeWork: {
- meta: {
- version: 2,
- type: 2
- },
- file: {
- type: 'xml',
- format: 'osc',
- encoding: 'base64',
- content: Buffer.from(featureToOsc(groupedFeatures[0])).toString('base64') // the base64-encoded osc file
+ const firstGroupedFeature = groupedFeatures[0]
+ const firstGroupedFeatureKey = [
+ firstGroupedFeature.properties['addr:housenumber'],
+ firstGroupedFeature.properties['addr:street']
+ ].join('|')
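+      // must match the 'housenumber|street' key format built by indexOSM above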
+
+    let foundInOSM = false
+    if (firstGroupedFeatureKey in osmAddressKeys) {
+      // the key exists in OSM, but only treat this address as already mapped
+      // when a match lies nearby; turf's distance() defaults to kilometers,
+      // so request meters explicitly
+      const closestDistance = Math.min(...osmAddressKeys[firstGroupedFeatureKey].map(osm =>
+        distance(osm, centroid(firstGroupedFeature), { units: 'meters' })
+      ))
+
+      if (closestDistance < 50) {
+        foundInOSM = true
+      }
+    }
+ if (!foundInOSM) {
+    // not found nearby in OSM, output as a MapRoulette cooperative task
+ const task = {
+ type: 'FeatureCollection',
+ features: [
+ ...groupedFeatures
+ ],
+ cooperativeWork: {
+ meta: {
+ version: 2,
+ type: 2
+ },
+ file: {
+ type: 'xml',
+ format: 'osc',
+ encoding: 'base64',
+ content: Buffer.from(featureToOsc(groupedFeatures[0])).toString('base64') // the base64-encoded osc file
+ }
}
}
+ debugStreams.mr_duplicateAddressFarApart.write(task)
}
- debugStreams.mr_duplicateAddressFarApart.write(task)
}
}
}
@@ -267,52 +325,65 @@ if (argv.debug) {
})
}
-// first pass to index by geometry
-console.log('Pass 1/2: index by address properties')
+// first pass to index existing OSM addresses
+console.log('Pass 1/3: Store existing OSM addresses')
pipeline(
- fs.createReadStream(inputFile),
+ fs.createReadStream(osmFile),
ndjson.parse(),
- index,
+ indexOSM,
err => {
if (err) {
console.log(err)
process.exit(1)
} else {
- console.log(` of ${sourceCount.toLocaleString()} features found ${Object.keys(features).length.toLocaleString()} unique addresses`)
- // second pass to reduce duplicate features
- console.log('Pass 2/2: reduce duplicate features')
+ // second pass to index by geometry
+ console.log('Pass 2/3: index by address properties')
pipeline(
- Readable.from(Object.keys(features)),
- reduce,
- ndjson.stringify(),
- fs.createWriteStream(outputFile),
+ fs.createReadStream(inputFile),
+ ndjson.parse(),
+ index,
err => {
if (err) {
console.log(err)
process.exit(1)
} else {
- if (argv.debug) {
- debugKeys.forEach(key => {
- debugStreams[key].end()
- })
+ console.log(` of ${sourceCount.toLocaleString()} features found ${Object.keys(features).length.toLocaleString()} unique addresses`)
+ // third pass to reduce duplicate features
+ console.log('Pass 3/3: reduce duplicate features')
+ pipeline(
+ Readable.from(Object.keys(features)),
+ reduce,
+ ndjson.stringify(),
+ fs.createWriteStream(outputFile),
+ err => {
+ if (err) {
+ console.log(err)
+ process.exit(1)
+ } else {
+ if (argv.debug) {
+ debugKeys.forEach(key => {
+ debugStreams[key].end()
+ })
- Promise.all(debugKeys.map(key => {
- return new Promise(resolve => {
- debugStreamOutputs[key].on('finish', () => {
- console.log(`saved debug/reduceDuplicates/${key}.geojson`)
- resolve()
- })
- })
- }))
- .then(() => {
- process.exit(0)
- })
- } else {
- process.exit(0)
- }
+ Promise.all(debugKeys.map(key => {
+ return new Promise(resolve => {
+ debugStreamOutputs[key].on('finish', () => {
+ console.log(`saved debug/reduceDuplicates/${key}.geojson`)
+ resolve()
+ })
+ })
+ }))
+ .then(() => {
+ process.exit(0)
+ })
+ } else {
+ process.exit(0)
+ }
+ }
+ }
+ )
}
}
)
}
- }
-)
+ })
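
The three passes above run as nested pipeline callbacks, three levels deep. For reference only (not part of the patch), a minimal sketch of the same control flow using the promisified pipeline from stream/promises (Node 15+); indexOSM, index, reduce, features, sourceCount and the file-path variables are the names used in this patch, everything else is illustrative:

const fs = require('fs')
const { Readable } = require('stream')
const { pipeline } = require('stream/promises')
const ndjson = require('ndjson')

async function run () {
  // Pass 1/3: store existing OSM addresses
  await pipeline(fs.createReadStream(osmFile), ndjson.parse(), indexOSM)

  // Pass 2/3: index by address properties
  await pipeline(fs.createReadStream(inputFile), ndjson.parse(), index)
  console.log(` of ${sourceCount.toLocaleString()} features found ${Object.keys(features).length.toLocaleString()} unique addresses`)

  // Pass 3/3: reduce duplicate features
  await pipeline(
    Readable.from(Object.keys(features)),
    reduce,
    ndjson.stringify(),
    fs.createWriteStream(outputFile)
  )
}

run().catch(err => {
  console.error(err)
  process.exit(1)
})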