Diffstat (limited to 'bin')
-rwxr-xr-x  bin/reduceDuplicates.js  171
-rwxr-xr-x  bin/reduceOverlap.js     205
-rwxr-xr-x  bin/vicmap2osm.js         68
3 files changed, 444 insertions, 0 deletions
diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js
new file mode 100755
index 0000000..c9a7769
--- /dev/null
+++ b/bin/reduceDuplicates.js
@@ -0,0 +1,171 @@
+#!/usr/bin/env node
+
+/**
+ * Remove duplicate features (identical address properties) at the same location or within a small proximity.
+ */
+
+const fs = require('fs')
+const { Readable, Transform, pipeline } = require('stream')
+const ndjson = require('ndjson')
+const cluster = require('../lib/cluster.js')
+
+const argv = require('yargs/yargs')(process.argv.slice(2))
+ .option('debug', {
+ type: 'boolean',
+ description: 'Dumps full debug logs'
+ })
+ .argv
+
+if (argv._.length < 2) {
+ console.error("Usage: ./reduceDuplicates.js input.geojson output.geojson")
+ process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+ console.error(`${inputFile} not found`)
+ process.exit(1)
+}
+
+let sourceCount = 0
+const features = {}
+
+const index = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(feature, encoding, callback) {
+ sourceCount++
+
+ if (sourceCount % 10000 === 0) {
+ process.stdout.write(` ${sourceCount / 1000}k\r`)
+ }
+
+ const key = [
+ feature.properties['addr:unit'],
+ feature.properties['addr:housename'],
+ feature.properties['addr:housenumber'],
+ feature.properties['addr:street'],
+ feature.properties['addr:suburb'],
+ feature.properties['addr:state'],
+ feature.properties['addr:postcode']
+ ].join('/')
+
+ if (!(key in features)) {
+ features[key] = []
+ }
+ features[key].push(feature)
+
+ callback()
+ }
+})
+
+let reduceIndex = 0
+const reduce = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(key, encoding, callback) {
+ reduceIndex++
+ if (reduceIndex % 10000 === 0) {
+ process.stdout.write(` ${reduceIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceIndex / sourceCount * 100)}%)\r`)
+ }
+
+ const groupedFeatures = features[key]
+ if (groupedFeatures.length === 1) {
+ // address not duplicated
+
+ this.push(groupedFeatures[0])
+ } else {
+ // address appears multiple times
+
+ const sameCoordinates = [...new Set(groupedFeatures.map(f => f.geometry.coordinates.join(',')))].length <= 1
+ if (sameCoordinates) {
+ // features have the same properties and the same geometry, so they are true duplicates and can be reduced to one
+ this.push(groupedFeatures[0])
+ } else {
+ // cluster features with a threshold of 25m
+ const clusters = cluster(groupedFeatures, 25)
+
+ // if clustered into a single cluster, then output a single average feature
+ if (clusters.length === 1) {
+ const averageCoordinates = [
+ groupedFeatures.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / groupedFeatures.length,
+ groupedFeatures.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / groupedFeatures.length
+ ]
+ const averageFeature = groupedFeatures[0]
+ averageFeature.geometry.coordinates = averageCoordinates
+
+ this.push(averageFeature)
+ } else {
+ // more than one cluster, reduce those clustered into one, and then report all the results
+ const clusterAverages = clusters.map(cluster => {
+ if (cluster.length === 1) {
+ return cluster[0]
+ } else {
+ const averageCoordinates = [
+ cluster.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / cluster.length,
+ cluster.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / cluster.length
+ ]
+ const averageFeature = cluster[0]
+ averageFeature.geometry.coordinates = averageCoordinates
+ return averageFeature
+ }
+ })
+
+ // report these as address points with the same attributes but different locations beyond the threshold
+ if (debugDuplicateAddressesStream) {
+ const webOfMatches = {
+ type: 'Feature',
+ properties: clusterAverages[0].properties,
+ geometry: {
+ type: 'LineString',
+ coordinates: clusterAverages.map(p => p.geometry.coordinates)
+ }
+ }
+ debugDuplicateAddressesStream.write(webOfMatches)
+ }
+ }
+ }
+ }
+
+ callback()
+ }
+})
+
+const debugDuplicateAddressesStream = argv.debug ?
+ ndjson.stringify()
+ .pipe(fs.createWriteStream('debug/reduceDuplicates/duplicateAddresses.geojson'))
+ : null
+
+// first pass to index by address properties
+console.log('First pass to index by address properties')
+pipeline(
+ fs.createReadStream(inputFile),
+ ndjson.parse(),
+ index,
+ err => {
+ if (err) {
+ console.log(err)
+ process.exit(1)
+ } else {
+ console.log(` of ${sourceCount} features found ${Object.keys(features).length} unique addresses`)
+ // second pass to reduce overlapping features
+ pipeline(
+ Readable.from(Object.keys(features)),
+ reduce,
+ ndjson.stringify(),
+ fs.createWriteStream(outputFile),
+ err => {
+ if (err) {
+ console.log(err)
+ process.exit(1)
+ } else {
+ if (debugDuplicateAddressesStream) debugDuplicateAddressesStream.end()
+ process.exit(0)
+ }
+ }
+ )
+ }
+ }
+)
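
reduceDuplicates.js relies on cluster() from lib/cluster.js, which is not part of this diff. The sketch below illustrates what a distance-threshold clustering helper with that signature could look like; the haversine helper and the greedy grouping strategy are assumptions made for illustration, and the real lib/cluster.js may be implemented differently.

// Sketch of a distance-threshold clustering helper (assumed interface of lib/cluster.js).
// Groups GeoJSON Point features so that each feature is within `thresholdMetres`
// of at least one other feature already in its cluster.

function haversineMetres([lonA, latA], [lonB, latB]) {
  const R = 6371000 // mean earth radius in metres
  const toRad = deg => deg * Math.PI / 180
  const dLat = toRad(latB - latA)
  const dLon = toRad(lonB - lonA)
  const a = Math.sin(dLat / 2) ** 2 +
    Math.cos(toRad(latA)) * Math.cos(toRad(latB)) * Math.sin(dLon / 2) ** 2
  return 2 * R * Math.asin(Math.sqrt(a))
}

function cluster(features, thresholdMetres) {
  const clusters = []
  for (const feature of features) {
    // join the first existing cluster that has a member within the threshold
    const match = clusters.find(members =>
      members.some(member =>
        haversineMetres(member.geometry.coordinates, feature.geometry.coordinates) <= thresholdMetres
      )
    )
    if (match) {
      match.push(feature)
    } else {
      clusters.push([feature])
    }
  }
  return clusters
}

module.exports = cluster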
diff --git a/bin/reduceOverlap.js b/bin/reduceOverlap.js
new file mode 100755
index 0000000..3984296
--- /dev/null
+++ b/bin/reduceOverlap.js
@@ -0,0 +1,205 @@
+#!/usr/bin/env node
+
+const fs = require('fs')
+const { Readable, Transform, pipeline } = require('stream')
+const ndjson = require('ndjson')
+const util = require('util')
+
+const argv = require('yargs/yargs')(process.argv.slice(2)).argv
+
+if (argv._.length < 2) {
+ console.error("Usage: ./reduceOverlap.js input.geojson output.geojson")
+ process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+ console.error(`${inputFile} not found`)
+ process.exit(1)
+}
+
+let sourceCount = 0
+const features = {}
+
+/**
+ * Index features by geometry. Used as a first pass, so a second pass can easily compare
+ * features with the same geometry.
+ */
+const index = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(feature, encoding, callback) {
+ sourceCount++
+
+ if (!argv.quiet) {
+ if (sourceCount % 10000 === 0) {
+ process.stdout.write(` ${sourceCount / 1000}k\r`)
+ }
+ }
+
+ const geometryKey = feature.geometry.coordinates.join(',')
+
+ if (!(geometryKey in features)) {
+ features[geometryKey] = []
+ }
+ features[geometryKey].push(feature)
+
+ callback()
+ }
+})
+
+/**
+ * Reduces features with the same geometry.
+ */
+let reduceIndex = 0
+const reduce = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(key, encoding, callback) {
+ reduceIndex++
+ if (!argv.quiet) {
+ if (reduceIndex % 10000 === 0) {
+ process.stdout.write(` ${reduceIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceIndex / sourceCount * 100)}%)\r`)
+ }
+ }
+
+ const groupedFeatures = features[key]
+
+ if (groupedFeatures.length === 1) {
+ // only one feature with this geometry, nothing to reduce, output as is
+ this.push(groupedFeatures[0])
+ } else {
+ // multiple features with the same geometry
+
+ // if housename, housenumber, street, suburb, state and postcode are all the same
+ // and only the unit differs,
+ // and there is an address without a unit,
+ // then drop the unit addresses and record their units as addr:flats on the non-unit address
+ const sameHousename = [...new Set(groupedFeatures.map(f => f.properties['addr:housename']))].length <= 1
+ const sameHousenumber = [...new Set(groupedFeatures.map(f => f.properties['addr:housenumber']))].length <= 1
+ const sameStreet = [...new Set(groupedFeatures.map(f => f.properties['addr:street']))].length <= 1
+ const sameSuburb = [...new Set(groupedFeatures.map(f => f.properties['addr:suburb']))].length <= 1
+ const sameState = [...new Set(groupedFeatures.map(f => f.properties['addr:state']))].length <= 1
+ const samePostcode = [...new Set(groupedFeatures.map(f => f.properties['addr:postcode']))].length <= 1
+
+ const hasNonUnit = groupedFeatures.map(f => 'addr:unit' in f.properties).includes(false)
+
+ if (sameHousename && sameHousenumber && sameStreet && sameSuburb && sameState && samePostcode) {
+ if (hasNonUnit) {
+ const nonUnitFeatures = groupedFeatures.filter(f => (!('addr:unit' in f.properties)))
+ if (nonUnitFeatures.length > 1) {
+ // multiple non-unit features, unsure how to reduce
+ console.log('multiple non-unit features, unsure how to reduce')
+ console.dir(groupedFeatures, {depth: null})
+ } else {
+ const nonUnitFeature = nonUnitFeatures[0]
+
+ // place all the other addr:unit into addr:flats
+ const allOtherUnits = groupedFeatures.filter(f => 'addr:unit' in f.properties).map(f => f.properties['addr:unit'])
+
+ // if allOtherUnits.length is one, then we have one address without a unit and one with a unit at the same point
+ // TODO should we just drop the non-unit address and keep the addr:unit one?
+ // need to determine whether a unit address always has a corresponding non-unit address; if it does,
+ // perhaps we can safely drop the non-unit address and keep the single addr:unit
+
+ // adapted from https://stackoverflow.com/a/54973116/6702659
+ const sortedAllOtherUnitsAsRanges = allOtherUnits
+ .slice()
+ .sort((a, b) => a - b)
+ .reduce((acc, cur, idx, src) => {
+ if ((idx > 0) && ((cur - src[idx - 1]) === 1)) {
+ acc[acc.length - 1][1] = cur
+ } else {
+ acc.push([cur])
+ }
+ return acc
+ }, [])
+ .map(range => range.join('-'))
+
+ nonUnitFeature.properties['addr:flats'] = sortedAllOtherUnitsAsRanges.join(';')
+ this.push(nonUnitFeature)
+ }
+ } else {
+ // all have same housenumber, street, suburb, state, postcode but no non-unit
+ // combine all the addr:unit into addr:flats and then drop addr:unit
+ const units = groupedFeatures.filter(f => 'addr:unit' in f.properties).map(f => f.properties['addr:unit'])
+
+ // TODO assert units.length > 1
+ if (units.length <= 1) {
+ // console.log(`all have same housenumber, street, suburb, state, postcode but no non-unit, but only found ${units.length} units`, units)
+ }
+
+ const feature = groupedFeatures[0]
+ delete feature.properties['addr:unit']
+
+ // adapted from https://stackoverflow.com/a/54973116/6702659
+ const unitRanges = units
+ .slice()
+ .sort((a, b) => a - b)
+ .reduce((acc, cur, idx, src) => {
+ if ((idx > 0) && ((cur - src[idx - 1]) === 1)) {
+ acc[acc.length - 1][1] = cur
+ } else {
+ acc.push([cur])
+ }
+ return acc
+ }, [])
+ .map(range => range.join('-'))
+
+ feature.properties['addr:flats'] = unitRanges.join(';')
+ this.push(feature)
+ }
+ } else {
+ // addresses with the same geometry, but more than just the unit differs
+ // TODO investigate what we can/should do about these
+ for (let i = 0; i < groupedFeatures.length; i++) {
+ this.push(groupedFeatures[i])
+ if (debugSameGeometry) {
+ debugSameGeometry.write(groupedFeatures[i])
+ }
+ }
+ }
+ }
+
+ callback()
+ }
+})
+
+const debugSameGeometry = argv.debug ?
+ ndjson.stringify()
+ .pipe(fs.createWriteStream('debug/reduceOverlap/sameGeometry.geojson'))
+ : null
+
+// first pass to index by geometry
+console.log('First pass to index by geometry')
+pipeline(
+ fs.createReadStream(inputFile),
+ ndjson.parse(),
+ index,
+ err => {
+ if (err) {
+ console.log(err)
+ process.exit(1)
+ } else {
+ console.log(` of ${sourceCount} features found ${Object.keys(features).length} unique geometries`)
+ // second pass to reduce overlapping features
+ pipeline(
+ Readable.from(Object.keys(features)),
+ reduce,
+ ndjson.stringify(),
+ fs.createWriteStream(outputFile),
+ err => {
+ if (err) {
+ console.log(err)
+ process.exit(1)
+ } else {
+ if (debugSameGeometry) debugSameGeometry.end()
+ process.exit(0)
+ }
+ }
+ )
+ }
+ }
+)
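
Both branches of the reduce transform above collapse sorted addr:unit values into addr:flats ranges with the same inline reduce. The sketch below pulls that logic out into a standalone function to show the intended output format; the unitsToFlats name is hypothetical and does not appear in the script, which inlines the logic twice instead.

// Standalone sketch of the addr:unit to addr:flats range compression used above.
function unitsToFlats(units) {
  return units
    .slice()
    .sort((a, b) => a - b)
    .reduce((acc, cur, idx, src) => {
      if ((idx > 0) && ((cur - src[idx - 1]) === 1)) {
        // extend the current run of consecutive units
        acc[acc.length - 1][1] = cur
      } else {
        // start a new run
        acc.push([cur])
      }
      return acc
    }, [])
    .map(range => range.join('-'))
    .join(';')
}

// e.g. unitsToFlats(['1', '2', '3', '5', '7', '8']) === '1-3;5;7-8'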
diff --git a/bin/vicmap2osm.js b/bin/vicmap2osm.js
new file mode 100755
index 0000000..b252f24
--- /dev/null
+++ b/bin/vicmap2osm.js
@@ -0,0 +1,68 @@
+#!/usr/bin/env node
+
+const fs = require('fs')
+const { Transform, pipeline } = require('readable-stream')
+const ndjson = require('ndjson')
+const toOSM = require('./toOSM.js')
+const filterOSM = require('./filterOSM.js')
+
+const argv = require('yargs/yargs')(process.argv.slice(2))
+ .option('debug', {
+ type: 'boolean',
+ description: 'Dumps full debug logs'
+ })
+ .option('tracing', {
+ type: 'boolean',
+ description: 'Includes _pfi tags to aid debugging'
+ })
+ .argv
+
+if (argv._.length < 2) {
+ console.error("Usage: ./vicmap2osm.js input.geojson output.geojson")
+ process.exit(1)
+}
+
+const inputFile = argv._[0]
+const outputFile = argv._[1]
+
+if (!fs.existsSync(inputFile)) {
+ console.error(`${inputFile} not found`)
+ process.exit(1)
+}
+
+const transform = new Transform({
+ readableObjectMode: true,
+ writableObjectMode: true,
+ transform(feature, encoding, callback) {
+ // convert source Feature into a Feature per the OSM schema
+ const osm = toOSM(feature, {
+ tracing: argv.tracing
+ })
+
+ // some addresses we skip importing into OSM, see README.md#omitted-addresses
+ if (filterOSM(osm, {
+ debug: argv.debug
+ })) {
+ this.push(osm)
+ }
+
+ callback()
+ }
+})
+
+// stream in source ndjson, transform and stream out
+pipeline(
+ fs.createReadStream(inputFile),
+ ndjson.parse(),
+ transform,
+ ndjson.stringify(),
+ fs.createWriteStream(outputFile),
+ (err) => {
+ if (err) {
+ console.log(err)
+ process.exit(1)
+ } else {
+ process.exit(0)
+ }
+ }
+)