Diffstat (limited to 'bin')
-rwxr-xr-x | bin/reduceDuplicates.js | 171
-rwxr-xr-x | bin/reduceOverlap.js    | 205
-rwxr-xr-x | bin/vicmap2osm.js       |  68
3 files changed, 444 insertions, 0 deletions
diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js
new file mode 100755
index 0000000..c9a7769
--- /dev/null
+++ b/bin/reduceDuplicates.js
@@ -0,0 +1,171 @@
+#!/usr/bin/env node
+
+/**
+ * Remove duplicates (exact tags) at the same location or within a small proximity.
+ */
+
+const fs = require('fs')
+const { Readable, Transform, pipeline } = require('stream')
+const ndjson = require('ndjson')
+const cluster = require('../lib/cluster.js')
+
+const argv = require('yargs/yargs')(process.argv.slice(2))
+  .option('debug', {
+    type: 'boolean',
+    description: 'Dumps full debug logs'
+  })
+  .argv
+
+if (argv._.length < 2) {
+  console.error("Usage: ./reduceDuplicates.js input.geojson output.geojson")
+  process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+  console.error(`${inputFile} not found`)
+  process.exit(1)
+}
+
+let sourceCount = 0
+const features = {}
+
+const index = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    sourceCount++
+
+    if (sourceCount % 10000 === 0) {
+      process.stdout.write(` ${sourceCount / 1000}k\r`)
+    }
+
+    const key = [
+      feature.properties['addr:unit'],
+      feature.properties['addr:housename'],
+      feature.properties['addr:housenumber'],
+      feature.properties['addr:street'],
+      feature.properties['addr:suburb'],
+      feature.properties['addr:state'],
+      feature.properties['addr:postcode']
+    ].join('/')
+
+    if (!(key in features)) {
+      features[key] = []
+    }
+    features[key].push(feature)
+
+    callback()
+  }
+})
+
+let reduceIndex = 0
+const reduce = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(key, encoding, callback) {
+    reduceIndex++
+    if (reduceIndex % 10000 === 0) {
+      process.stdout.write(` ${reduceIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceIndex / sourceCount * 100)}%)\r`)
+    }
+
+    const groupedFeatures = features[key]
+    if (groupedFeatures.length === 1) {
+      // address not duplicated
+
+      this.push(groupedFeatures[0])
+    } else {
+      // address appears multiple times
+
+      const sameCoordinates = [...new Set(groupedFeatures.map(f => f.geometry.coordinates.join(',')))].length <= 1
+      if (sameCoordinates) {
+        // features have same properties and same geometry, so true duplicates can reduce to one
+        this.push(groupedFeatures[0])
+      } else {
+        // cluster features with a threshold of 25m
+        const clusters = cluster(groupedFeatures, 25)
+
+        // if clustered into a single cluster, then output a single average feature
+        if (clusters.length === 1) {
+          const averageCoordinates = [
+            groupedFeatures.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / groupedFeatures.length,
+            groupedFeatures.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / groupedFeatures.length
+          ]
+          const averageFeature = groupedFeatures[0]
+          averageFeature.geometry.coordinates = averageCoordinates
+
+          this.push(averageFeature)
+        } else {
+          // more than one cluster, reduce those clustered into one, and then report all the results
+          const clusterAverages = clusters.map(cluster => {
+            if (cluster.length === 1) {
+              return cluster[0]
+            } else {
+              const averageCoordinates = [
+                cluster.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / cluster.length,
+                cluster.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / cluster.length
+              ]
+              const averageFeature = cluster[0]
+              averageFeature.geometry.coordinates = averageCoordinates
+              return averageFeature
+            }
+          })
+
+          // report these as address points with the same attributes but different locations beyond the threshold
+          if (debugDuplicateAddressesStream) {
+            const webOfMatches = {
+              type: 'Feature',
+              properties: clusterAverages[0].properties,
+              geometry: {
+                type: 'LineString',
+                coordinates: clusterAverages.map(p => p.geometry.coordinates)
+              }
+            }
+            debugDuplicateAddressesStream.write(webOfMatches)
+          }
+        }
+      }
+    }
+
+    callback()
+  }
+})
+
+const debugDuplicateAddressesStream = argv.debug ?
+  ndjson.stringify()
+    .pipe(fs.createWriteStream('debug/reduceDuplicates/duplicateAddresses.geojson'))
+  : null
+
+// first pass to index by address properties
+console.log('First pass to index by address properties')
+pipeline(
+  fs.createReadStream(inputFile),
+  ndjson.parse(),
+  index,
+  err => {
+    if (err) {
+      console.log(err)
+      process.exit(1)
+    } else {
+      console.log(` of ${sourceCount} features found ${Object.keys(features).length} unique addresses`)
+      // second pass to reduce overlapping features
+      pipeline(
+        Readable.from(Object.keys(features)),
+        reduce,
+        ndjson.stringify(),
+        fs.createWriteStream(outputFile),
+        err => {
+          if (err) {
+            console.log(err)
+            process.exit(1)
+          } else {
+            if (debugDuplicateAddressesStream) debugDuplicateAddressesStream.end()
+            process.exit(0)
+          }
+        }
+      )
+    }
+  }
+)
diff --git a/bin/reduceOverlap.js b/bin/reduceOverlap.js
new file mode 100755
index 0000000..3984296
--- /dev/null
+++ b/bin/reduceOverlap.js
@@ -0,0 +1,205 @@
+#!/usr/bin/env node
+
+const fs = require('fs')
+const { Readable, Transform, pipeline } = require('stream')
+const ndjson = require('ndjson')
+const util = require('util')
+
+const argv = require('yargs/yargs')(process.argv.slice(2)).argv
+
+if (argv._.length < 2) {
+  console.error("Usage: ./reduceOverlap.js input.geojson output.geojson")
+  process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+  console.error(`${inputFile} not found`)
+  process.exit(1)
+}
+
+let sourceCount = 0
+const features = {}
+
+/**
+ * Index features by geometry. Used as a first pass, so a second pass can easily compare
+ * features with the same geometry.
+ */
+const index = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    sourceCount++
+
+    if (!argv.quiet) {
+      if (sourceCount % 10000 === 0) {
+        process.stdout.write(` ${sourceCount / 1000}k\r`)
+      }
+    }
+
+    const geometryKey = feature.geometry.coordinates.join(',')
+
+    if (!(geometryKey in features)) {
+      features[geometryKey] = []
+    }
+    features[geometryKey].push(feature)
+
+    callback()
+  }
+})
+
+/**
+ * Reduces features with the same geometry.
+ */
+let reduceIndex = 0
+const reduce = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(key, encoding, callback) {
+    reduceIndex++
+    if (!argv.quiet) {
+      if (reduceIndex % 10000 === 0) {
+        process.stdout.write(` ${reduceIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceIndex / sourceCount * 100)}%)\r`)
+      }
+    }
+
+    var groupedFeatures = features[key]
+
+    if (groupedFeatures.length === 1) {
+      // only one feature with this geometry, nothing to reduce, output as is
+      this.push(groupedFeatures[0])
+    } else {
+      // multiple features with the same geometry
+
+      // if housename, housenumber, street, suburb, state, postcode are all the same
+      // and it's only unit which differs,
+      // and there is an address with no unit
+      // then remove all the unit addresses and add them as addr:flats on the no unit address
+      const sameHousename = [...new Set(groupedFeatures.map(f => f.properties['addr:housename']))].length <= 1
+      const sameHousenumber = [...new Set(groupedFeatures.map(f => f.properties['addr:housenumber']))].length <= 1
+      const sameStreet = [...new Set(groupedFeatures.map(f => f.properties['addr:street']))].length <= 1
+      const sameSuburb = [...new Set(groupedFeatures.map(f => f.properties['addr:suburb']))].length <= 1
+      const sameState = [...new Set(groupedFeatures.map(f => f.properties['addr:state']))].length <= 1
+      const samePostcode = [...new Set(groupedFeatures.map(f => f.properties['addr:postcode']))].length <= 1
+
+      const hasNonUnit = groupedFeatures.map(f => 'addr:unit' in f.properties).includes(false)
+
+      if (sameHousename && sameHousenumber && sameStreet && sameSuburb && sameState && samePostcode) {
+        if (hasNonUnit) {
+          const nonUnitFeatures = groupedFeatures.filter(f => (!('addr:unit' in f.properties)))
+          if (nonUnitFeatures.length > 1) {
+            // multiple non-unit features, unsure how to reduce
+            console.log('multiple non-unit features, unsure how to reduce')
+            console.dir(groupedFeatures, {depth: null})
+          } else {
+            const nonUnitFeature = nonUnitFeatures[0]
+
+            // place all the other addr:unit into addr:flats
+            const allOtherUnits = groupedFeatures.filter(f => 'addr:unit' in f.properties).map(f => f.properties['addr:unit'])
+
+            // if allOtherUnits.length is one then that means we have one address without a unit and one with a unit at the same point
+            // TODO should we just drop the non-unit address and keep the addr:unit one?
+            // need to determine if you always have a non-unit address for the unit address, if there is then
+            // perhaps we can safely drop the non-unit address and use a single addr:unit
+
+            // adapted from https://stackoverflow.com/a/54973116/6702659
+            const sortedAllOtherUnitsAsRanges = allOtherUnits
+              .slice()
+              .sort((a, b) => a - b)
+              .reduce((acc, cur, idx, src) => {
+                if ((idx > 0) && ((cur - src[idx - 1]) === 1)) {
+                  acc[acc.length - 1][1] = cur
+                } else {
+                  acc.push([cur])
+                }
+                return acc
+              }, [])
+              .map(range => range.join('-'))
+
+            nonUnitFeature.properties['addr:flats'] = sortedAllOtherUnitsAsRanges.join(';')
+            this.push(nonUnitFeature)
+          }
+        } else {
+          // all have same housenumber, street, suburb, state, postcode but no non-unit
+          // combine all the addr:unit into addr:flats and then drop addr:unit
+          const units = groupedFeatures.filter(f => 'addr:unit' in f.properties).map(f => f.properties['addr:unit'])
+
+          // TODO assert units.length > 1
+          if (units.length <= 1) {
+            // console.log(`all have same housenumber, street, suburb, state, postcode but no non-unit, but only found ${units.length} units`, units)
+          }
+
+          const feature = groupedFeatures[0]
+          delete feature.properties['addr:unit']
+
+          // adapted from https://stackoverflow.com/a/54973116/6702659
+          const unitRanges = units
+            .slice()
+            .sort((a, b) => a - b)
+            .reduce((acc, cur, idx, src) => {
+              if ((idx > 0) && ((cur - src[idx - 1]) === 1)) {
+                acc[acc.length - 1][1] = cur
+              } else {
+                acc.push([cur])
+              }
+              return acc
+            }, [])
+            .map(range => range.join('-'))
+
+          feature.properties['addr:flats'] = unitRanges.join(';')
+          this.push(feature)
+        }
+      } else {
+        // addresses with the same geometry, however more than unit differs
+        // TODO need to investigate to see what we can/should do about these
+        for (let i = 0; i < groupedFeatures.length; i++) {
+          this.push(groupedFeatures[i])
+          if (debugSameGeometry) {
+            debugSameGeometry.write(groupedFeatures[i])
+          }
+        }
+      }
+    }
+
+    callback()
+  }
+})
+
+const debugSameGeometry = argv.debug ?
+  ndjson.stringify()
+    .pipe(fs.createWriteStream('debug/reduceOverlap/sameGeometry.geojson'))
+  : null
+
+// first pass to index by geometry
+console.log('First pass to index by geometry')
+pipeline(
+  fs.createReadStream(inputFile),
+  ndjson.parse(),
+  index,
+  err => {
+    if (err) {
+      console.log(err)
+      process.exit(1)
+    } else {
+      console.log(` of ${sourceCount} features found ${Object.keys(features).length} unique geometries`)
+      // second pass to reduce overlapping features
+      pipeline(
+        Readable.from(Object.keys(features)),
+        reduce,
+        ndjson.stringify(),
+        fs.createWriteStream(outputFile),
+        err => {
+          if (err) {
+            console.log(err)
+            process.exit(1)
+          } else {
+            if (debugSameGeometry) debugSameGeometry.end()
+            process.exit(0)
+          }
+        }
+      )
+    }
+  }
+)
diff --git a/bin/vicmap2osm.js b/bin/vicmap2osm.js
new file mode 100755
index 0000000..b252f24
--- /dev/null
+++ b/bin/vicmap2osm.js
@@ -0,0 +1,68 @@
+#!/usr/bin/env node
+
+const fs = require('fs')
+const { Transform, pipeline } = require('readable-stream')
+const ndjson = require('ndjson')
+const toOSM = require('./toOSM.js')
+const filterOSM = require('./filterOSM.js')
+
+const argv = require('yargs/yargs')(process.argv.slice(2))
+  .option('debug', {
+    type: 'boolean',
+    description: 'Dumps full debug logs'
+  })
+  .option('tracing', {
+    type: 'boolean',
+    description: 'Includes _pfi tags to aid debugging'
+  })
+  .argv
+
+if (argv._.length < 2) {
+  console.error("Usage: ./vicmap2osm.js input.geojson output.geojson")
+  process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+  console.error(`${inputFile} not found`)
+  process.exit(1)
+}
+
+const transform = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    // convert source Feature into a Feature per the OSM schema
+    const osm = toOSM(feature, {
+      tracing: argv.tracing
+    })
+
+    // some addresses we skip importing into OSM, see README.md#omitted-addresses
+    if (filterOSM(osm, {
+      debug: argv.debug
+    })) {
+      this.push(osm)
+    }
+
+    callback()
+  }
+})
+
+// stream in source ndjson, transform and stream out
+pipeline(
+  fs.createReadStream(inputFile),
+  ndjson.parse(),
+  transform,
+  ndjson.stringify(),
+  fs.createWriteStream(outputFile),
+  (err) => {
+    if (err) {
+      console.log(err)
+      process.exit(1)
+    } else {
+      process.exit(0)
+    }
+  }
+)