diff options
-rw-r--r-- | Makefile | 21 | ||||
-rw-r--r-- | README.md | 79 | ||||
-rwxr-xr-x | bin/reduceDuplicates.js | 171 | ||||
-rwxr-xr-x | bin/reduceOverlap.js (renamed from reduceOverlap.js) | 58 | ||||
-rwxr-xr-x | bin/vicmap2osm.js (renamed from vicmap2osm.js) | 27 | ||||
-rw-r--r-- | filterOSM.js | 10 | ||||
-rw-r--r-- | lib/cluster.js (renamed from cluster.js) | 0 | ||||
-rw-r--r-- | lib/filterOSM.js | 16 | ||||
-rw-r--r-- | lib/toOSM.js (renamed from toOSM.js) | 14 | ||||
-rw-r--r-- | src/createIndexQuery.sql | 5 |
10 files changed, 370 insertions, 31 deletions
@@ -28,13 +28,28 @@ data/vicmap.geojson: data/vicmap/ll_gda94/sde_shape/whole/VIC/VMADD/layer/addres ogr2ogr -f GeoJSONSeq $@ $< dist/vicmap-osm.geojson: data/vicmap.geojson - ./vicmap2osm.js $< $@ + ./bin/vicmap2osm.js $< $@ -dist/vicmap-osm-flats.geojson: dist/vicmap-osm.geojson - ./reduceOverlap.js $< $@ +dist/vicmap-osm-uniq.geojson: dist/vicmap-osm.geojson + node --max-old-space-size=4096 bin/reduceDuplicates.js $< $@ + +dist/vicmap-osm-uniq-flats.geojson: dist/vicmap-osm-uniq.geojson + ./bin/reduceOverlap.js $< $@ + +loadPgOSM: dist/vicmap-osm.geojson + ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vm_osm data/vicmap.fgb: data/vicmap/ll_gda94/sde_shape/whole/VIC/VMADD/layer/address.shp ogr2ogr -f FlatGeobuf $@ $< dist/vicmap-osm.fgb: dist/vicmap-osm.geojson ogr2ogr -f FlatGeobuf $@ $< + +# useful for development to be able to query a database +loadPgAdd: data/vicmap/ll_gda94/sde_shape/whole/VIC/VMADD/layer/address.shp + ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vmadd + # index all columns for faster queries during development + psql -f src/createIndexQuery.sql --tuples-only | psql + +loadPgProp: data/vicmap/ll_gda94/sde_shape/whole/VIC/VMPROP/layer/property_view.shp + ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vmprop diff --git a/README.md b/README.md new file mode 100644 index 0000000..c4315df --- /dev/null +++ b/README.md @@ -0,0 +1,79 @@ +# vicmap2osm + +Prepares [Vicmap Address](https://www.land.vic.gov.au/maps-and-spatial/spatial-data/vicmap-catalogue/vicmap-address) data for import into OpenStreetMap. + +Vicmap Address data © State of Victoria (Department of Environment, Land, Water and Planning), CC BY 4.0, with an [OSMF LWG CC waiver](https://wiki.openstreetmap.org/wiki/File:Vicmap_CCBYPermission_OSM_Final_Jan2018_Ltr.pdf). 
+ +## GitLab CI/CD + +GitLab CI/CD automates the data processing in stages. + +The _prepare_ stage downloads Vicmap Address data and converts it into GeoJSON. Because this takes around 45 minutes, it's cached through CI/CD for future use. + +The _build_ stage does all the processing to produce the import candidate data and intermediate datasets and reports. + +## Build candidate files + +Download source Vicmap data and convert to GeoJSON: + + make data/vicmap.geojson + +Convert into OSM address schema, and omit addresses which don't meet our threshold for import (see _Omitted addresses_ below) (code at `bin/vicmap2osm.js`): + + make dist/vicmap-osm.geojson + +Remove duplicates where all address attributes match at the same location or within a small proximity (code at `bin/reduceDuplicates.js`): + + make dist/vicmap-osm-uniq.geojson + +Reduce some address points with the same coordinates but different address attributes (see _Overlapping points_ below) (code at `bin/reduceOverlap.js`): + + make dist/vicmap-osm-uniq-flats.geojson + +This is only done for strictly overlapping points; where the geometry varies even slightly, we don't attempt to combine them. + +### Omitted addresses + +Source addresses are omitted where they: + +1. have neither an `addr:housenumber` nor an `addr:housename`. + +Since these addresses have no identifying attribute beyond street, and there are often multiple of these along a street all with the same street/suburb/postcode, they are of little utility and therefore omitted. + +These rules are defined in `lib/filterOSM.js`. + +### OSM schema + +- `addr:unit` is constructed either as a single value or range where the building unit is supplied +- `addr:housename` is included where there is a building name present in the source +- `addr:housenumber` is constructed from the number prefix, main number and number suffix fields for each of the from/to range, eg `1A-3B`. 
+- `addr:street` is constructed from the street proper name, street type and street suffix, formatted as capital case, eg `Main Street North`. +- `addr:suburb` is constructed from the locality value formatted as capital case. +- `addr:postcode` is as supplied. +- `addr:state` is as supplied and should always be `VIC`. + +The schema mapping mostly happens in `lib/toOSM.js`. + +### Overlapping points + +Source address data contains many overlapping address points. + +1. First pass, where all the OSM tags are the same, and the points have the exact same geometry, all the duplicates are omitted. + +Where each of the housenumber, street, suburb, postcode, state are the same for each of the overlapping points, but only the unit value differs, we attempt to reduce these to a single address point without `addr:unit` but instead using [`addr:flats`](https://wiki.openstreetmap.org/wiki/Key:addr:flats). + +`addr:flats` is the documented tag for describing the unit numbers at an address. + +In the real world where you have different unit numbers for townhouses or villas, ideally you'd have different addresses in OSM using `addr:unit` but have each located on each dwelling. + +Where you have an apartment building containing multiple units, this import chooses to avoid overlapping addresses each with a different `addr:unit`, instead creating a single node with `addr:flats`. + +Where possible, unit numbers are reduced to ranges, for example to create `addr:flats=1-5;10-15;20` instead of `addr:flats=1;2;3;4;5;10;11;12;13;14;15;20`. + +Multiple points overlapping don't add any extra value to the OSM data and are harder for mappers to manage, especially for large apartment buildings. + +Data consumers can still easily explode `addr:flats` out into overlapping nodes with varying `addr:unit` if desired. + +### null values + +Values `UNNAMED` and `NOT NAMED` appear as street names and locality names. These values are treated as null/empty values rather than proper names. 
diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js new file mode 100755 index 0000000..c9a7769 --- /dev/null +++ b/bin/reduceDuplicates.js @@ -0,0 +1,171 @@ +#!/usr/bin/env node + +/** + * Remove duplicates (exact tags) at the same location or within a small proximity. + */ + +const fs = require('fs') +const { Readable, Transform, pipeline } = require('stream') +const ndjson = require('ndjson') +const cluster = require('../lib/cluster.js') + +const argv = require('yargs/yargs')(process.argv.slice(2)) + .option('debug', { + type: 'boolean', + description: 'Dumps full debug logs' + }) + .argv + +if (argv._.length < 2) { + console.error("Usage: ./reduceDuplicates.js input.geojson output.geojson") + process.exit(1) +} + +const inputFile = argv._[0] +const outputFile = argv._[1] + +if (!fs.existsSync(inputFile)) { + console.error(`${inputFile} not found`) + process.exit(1) +} + +let sourceCount = 0 +const features = {} + +const index = new Transform({ + readableObjectMode: true, + writableObjectMode: true, + transform(feature, encoding, callback) { + sourceCount++ + + if (sourceCount % 10000 === 0) { + process.stdout.write(` ${sourceCount / 1000}k\r`) + } + + const key = [ + feature.properties['addr:unit'], + feature.properties['addr:housename'], + feature.properties['addr:housenumber'], + feature.properties['addr:street'], + feature.properties['addr:suburb'], + feature.properties['addr:state'], + feature.properties['addr:postcode'] + ].join('/') + + if (!(key in features)) { + features[key] = [] + } + features[key].push(feature) + + callback() + } +}) + +let reduceIndex = 0 +const reduce = new Transform({ + readableObjectMode: true, + writableObjectMode: true, + transform(key, encoding, callback) { + reduceIndex++ + if (reduceIndex % 10000 === 0) { + process.stdout.write(` ${reduceIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceIndex / sourceCount * 100)}%)\r`) + } + + const groupedFeatures = features[key] + if 
(groupedFeatures.length === 1) { + // address not duplicated + + this.push(groupedFeatures[0]) + } else { + // address appears multiple times + + const sameCoordinates = [...new Set(groupedFeatures.map(f => f.geometry.coordinates.join(',')))].length <= 1 + if (sameCoordinates) { + // features have same properties and same geometry, so true duplicates can reduce to one + this.push(groupedFeatures[0]) + } else { + // cluster features with a threshold of 25m + const clusters = cluster(groupedFeatures, 25) + + // if clustered into a single cluster, then output a single average feature + if (clusters.length === 1) { + const averageCoordinates = [ + groupedFeatures.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / groupedFeatures.length, + groupedFeatures.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / groupedFeatures.length + ] + const averageFeature = groupedFeatures[0] + averageFeature.geometry.coordinates = averageCoordinates + + this.push(averageFeature) + } else { + // more than one cluster, reduce those clustered into one, and then report all the results + const clusterAverages = clusters.map(cluster => { + if (cluster.length === 1) { + return cluster + } else { + const averageCoordinates = [ + cluster.map(f => f.geometry.coordinates[0]).reduce((acc, cur) => acc + cur) / groupedFeatures.length, + cluster.map(f => f.geometry.coordinates[1]).reduce((acc, cur) => acc + cur) / groupedFeatures.length + ] + const averageFeature = cluster[0] + averageFeature.geometry.coordinates = averageCoordinates + return averageFeature + } + }) + + // report these as address points with the same attributes but different locations beyond the threshold + if (debugDuplicateAddressStream) { + const webOfMatches = { + type: 'Feature', + properties: clusterAverages[0].properties, + geometry: { + type: 'LineString', + coordinates: averageClusters.map(p => p.geometry.coordinates) + } + } + debugDuplicateAddressStream.write(webOfMatches) + } + } + } + 
} + + callback() + } +}) + +const debugDuplicateAddressesStream = argv.debug ? + ndjson.stringify() + .pipe(fs.createWriteStream('debug/reduceDuplicates/duplicateAddresses.geojson')) + : null + +// first pass to index by geometry +console.log('First pass to index by address properties') +pipeline( + fs.createReadStream(inputFile), + ndjson.parse(), + index, + err => { + if (err) { + console.log(err) + process.exit(1) + } else { + console.log(` of ${sourceCount} features found ${Object.keys(features).length} unique addresses`) + // second pass to reduce overlapping features + pipeline( + Readable.from(Object.keys(features)), + reduce, + ndjson.stringify(), + fs.createWriteStream(outputFile), + err => { + if (err) { + console.log(err) + process.exit(1) + } else { + debugDuplicateAddressesStream.end() + process.exit(0) + } + } + ) + } + } +) diff --git a/reduceOverlap.js b/bin/reduceOverlap.js index 8b24baa..3984296 100755 --- a/reduceOverlap.js +++ b/bin/reduceOverlap.js @@ -3,16 +3,17 @@ const fs = require('fs') const { Readable, Transform, pipeline } = require('stream') const ndjson = require('ndjson') +const util = require('util') -const args = process.argv.slice(2) +const argv = require('yargs/yargs')(process.argv.slice(2)).argv -if (args.length < 2) { +if (argv._.length < 2) { console.error("Usage: ./reduceOverlap.js input.geojson output.geojson") process.exit(1) } -const inputFile = args[0] -const outputFile = args[1] +const inputFile = argv._[0] +const outputFile = argv._[1] if (!fs.existsSync(inputFile)) { console.error(`${inputFile} not found`) @@ -22,12 +23,22 @@ if (!fs.existsSync(inputFile)) { let sourceCount = 0 const features = {} +/** + * Index features by geometry. Used as a first pass, so a second pass can easily compare + * features with the same geometry. 
+ */ const index = new Transform({ readableObjectMode: true, writableObjectMode: true, transform(feature, encoding, callback) { sourceCount++ + if (!argv.quiet) { + if (sourceCount % 10000 === 0) { + process.stdout.write(` ${sourceCount / 1000}k\r`) + } + } + const geometryKey = feature.geometry.coordinates.join(',') if (!(geometryKey in features)) { @@ -39,22 +50,34 @@ const index = new Transform({ } }) +/** + * Reduces features with the same geometry. + */ +let reduceIndex = 0 const reduce = new Transform({ readableObjectMode: true, writableObjectMode: true, transform(key, encoding, callback) { + reduceIndex++ + if (!argv.quiet) { + if (reduceIndex % 10000 === 0) { + process.stdout.write(` ${reduceIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceIndex / sourceCount * 100)}%)\r`) + } + } var groupedFeatures = features[key] + if (groupedFeatures.length === 1) { - // point not overlapping + // only one feature with this geometry, nothing to reduce, output as is this.push(groupedFeatures[0]) } else { - // points overlapping, try to reduce to non-overlapping + // mulitple features with the same geometry - // if housenumber, street, suburb, state, postcode are all the same + // if housename, housenumber, street, suburb, state, postcode are all the same // and it's only unit which differs, // and there is an address with no unit // then remove all the unit addresses and add them as addr:flats on the no unit address + const sameHousename = [...new Set(groupedFeatures.map(f => f.properties['addr:housename']))].length <= 1 const sameHousenumber = [...new Set(groupedFeatures.map(f => f.properties['addr:housenumber']))].length <= 1 const sameStreet = [...new Set(groupedFeatures.map(f => f.properties['addr:street']))].length <= 1 const sameSuburb = [...new Set(groupedFeatures.map(f => f.properties['addr:suburb']))].length <= 1 @@ -63,11 +86,13 @@ const reduce = new Transform({ const hasNonUnit = groupedFeatures.map(f => 'addr:unit' in 
f.properties).includes(false) - if (sameHousenumber && sameStreet && sameSuburb && sameState && samePostcode) { + if (sameHousename && sameHousenumber && sameStreet && sameSuburb && sameState && samePostcode) { if (hasNonUnit) { const nonUnitFeatures = groupedFeatures.filter(f => (!('addr:unit' in f.properties))) if (nonUnitFeatures.length > 1) { // multiple non-unit features, unsure how to reduce + console.log('multiple non-unit features, unsure how to reduce') + console.dir(groupedFeatures, {depth: null}) } else { const nonUnitFeature = nonUnitFeatures[0] @@ -76,6 +101,8 @@ const reduce = new Transform({ // if allOtherUnits.length is one then that means we have one address without a unit and one with a unit at the same point // TODO should we just drop the non-unit address and keep the addr:unit one? + // need to determine if you always have a non-unit address for the unit address, if there is then + // perhaps we can safely drop the non-unit address and use a single addr:unit // adapted from https://stackoverflow.com/a/54973116/6702659 const sortedAllOtherUnitsAsRanges = allOtherUnits @@ -100,6 +127,9 @@ const reduce = new Transform({ const units = groupedFeatures.filter(f => 'addr:unit' in f.properties).map(f => f.properties['addr:unit']) // TODO assert units.length > 1 + if (units.length <= 1) { + // console.log(`all have same housenumber, street, suburb, state, postcode but no non-unit, but only found ${units.length} units`, units) + } const feature = groupedFeatures[0] delete feature.properties['addr:unit'] @@ -122,11 +152,13 @@ const reduce = new Transform({ this.push(feature) } } else { - console.log('addresses with the same geometry, however more than unit differs') + // addresses with the same geometry, however more than unit differs // TODO need to investigate to see what we can/shoud do about these - //console.log(groupedFeatures) for (let i = 0; i < groupedFeatures.length; i++) { this.push(groupedFeatures[i]) + if (debugSameGeometry) { + 
debugSameGeometry.write(groupedFeatures[i]) + } } } } @@ -135,6 +167,11 @@ const reduce = new Transform({ } }) +const debugSameGeometry = argv.debug ? + ndjson.stringify() + .pipe(fs.createWriteStream('debug/reduceOverlap/sameGeometry.geojson')) + : null + // first pass to index by geometry console.log('First pass to index by geometry') pipeline( @@ -158,6 +195,7 @@ pipeline( console.log(err) process.exit(1) } else { + debugSameGeometry.end() process.exit(0) } } diff --git a/vicmap2osm.js b/bin/vicmap2osm.js index e579d98..b252f24 100755 --- a/vicmap2osm.js +++ b/bin/vicmap2osm.js @@ -6,15 +6,24 @@ const ndjson = require('ndjson') const toOSM = require('./toOSM.js') const filterOSM = require('./filterOSM.js') -const args = process.argv.slice(2) +const argv = require('yargs/yargs')(process.argv.slice(2)) + .option('debug', { + type: 'boolean', + description: 'Dumps full debug logs' + }) + .option('tracing', { + type: 'boolean', + description: 'Includes _pfi tags to aid debugging' + }) + .argv -if (args.length < 2) { +if (argv._.length < 2) { console.error("Usage: ./vicmap2osm.js input.geojson output.geojson") process.exit(1) } -const inputFile = args[0] -const outputFile = args[1] +const inputFile = argv._[0] +const outputFile = argv._[1] if (!fs.existsSync(inputFile)) { console.error(`${inputFile} not found`) @@ -26,10 +35,14 @@ const transform = new Transform({ writableObjectMode: true, transform(feature, encoding, callback) { // convert source Feature into a Feature per the OSM schema - const osm = toOSM(feature) + const osm = toOSM(feature, { + tracing: argv.tracing + }) - // some addresses we skip importing into OSM - if (filterOSM(osm)) { + // some addresses we skip importing into OSM, see README.md#omitted-addresses + if (filterOSM(osm, { + debug: argv.debug + })) { this.push(osm) } diff --git a/filterOSM.js b/filterOSM.js deleted file mode 100644 index 93db4a7..0000000 --- a/filterOSM.js +++ /dev/null @@ -1,10 +0,0 @@ -module.exports = (feature) => { - - // 
skip any addresses without a housenumber - // eg PFI 53396626 has no housenumber - if (!('addr:housenumber' in feature.properties)) { - return false - } - - return true -} diff --git a/cluster.js b/lib/cluster.js index c716063..c716063 100644 --- a/cluster.js +++ b/lib/cluster.js diff --git a/lib/filterOSM.js b/lib/filterOSM.js new file mode 100644 index 0000000..e530773 --- /dev/null +++ b/lib/filterOSM.js @@ -0,0 +1,16 @@ +module.exports = (feature, options) => { + + // skip any addresses without either a housenumber or housename + // eg PFI 53396626 has no housenumber + if ( + !('addr:housenumber' in feature.properties) && + !('addr:housename' in feature.properties) + ) { + if (argv.debug) { + console.log(`PFI ${feature.properties._pfi} has neither a addr:housename or addr:housenumber, filtering`) + } + return false + } + + return true +} @@ -65,12 +65,22 @@ const emptyNames = [ 'NOT NAMED' ] -module.exports = (sourceFeature) => { +/** + * Transforms a GeoJSON Feature from the Vicmap address schema into OSM schema + * + * @param sourceFeature Feature in Vicmap address schema + * @returns Feature in OSM schema + */ +module.exports = (sourceFeature, options) => { const outputFeature = Object.assign({}, sourceFeature) const sourceProperties = sourceFeature.properties const outputProperties = {} + if (options && options.tracing) { + outputProperties['_pfi'] = sourceProperties.PFI + } + // Building sub address type (eg UNIT OFFICE SHOP) // // bld_unit_* @@ -104,9 +114,11 @@ module.exports = (sourceFeature) => { outputProperties['addr:unit'] = bld_unit } + /* if (sourceProperties.BLGUNTTYP && sourceProperties.BLGUNTTYP in buildingUnitType) { outputProperties['addr:unit:type'] = buildingUnitType[sourceProperties.BLGUNTTYP] } + */ if (sourceProperties.BUILDING) { outputProperties['addr:housename'] = sourceProperties.BUILDING diff --git a/src/createIndexQuery.sql b/src/createIndexQuery.sql new file mode 100644 index 0000000..a4e2712 --- /dev/null +++ 
b/src/createIndexQuery.sql @@ -0,0 +1,5 @@ +SELECT + 'CREATE INDEX CONCURRENTLY ' || table_name || '_' || column_name || ' ON ' || table_name || ' ("' || column_name || '");' +FROM information_schema.columns +WHERE table_schema = 'public' + AND table_name = 'vmadd'; |