aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile21
-rw-r--r--README.md79
-rwxr-xr-xbin/reduceDuplicates.js171
-rwxr-xr-xbin/reduceOverlap.js (renamed from reduceOverlap.js)58
-rwxr-xr-xbin/vicmap2osm.js (renamed from vicmap2osm.js)27
-rw-r--r--filterOSM.js10
-rw-r--r--lib/cluster.js (renamed from cluster.js)0
-rw-r--r--lib/filterOSM.js16
-rw-r--r--lib/toOSM.js (renamed from toOSM.js)14
-rw-r--r--src/createIndexQuery.sql5
10 files changed, 370 insertions, 31 deletions
diff --git a/Makefile b/Makefile
index 2a723e4..f295eef 100644
--- a/Makefile
+++ b/Makefile
@@ -28,13 +28,28 @@ data/vicmap.geojson: data/vicmap/ll_gda94/sde_shape/whole/VIC/VMADD/layer/addres
ogr2ogr -f GeoJSONSeq $@ $<
dist/vicmap-osm.geojson: data/vicmap.geojson
- ./vicmap2osm.js $< $@
+ ./bin/vicmap2osm.js $< $@
-dist/vicmap-osm-flats.geojson: dist/vicmap-osm.geojson
- ./reduceOverlap.js $< $@
+dist/vicmap-osm-uniq.geojson: dist/vicmap-osm.geojson
+ node --max-old-space-size=4096 bin/reduceDuplicates.js $< $@
+
+dist/vicmap-osm-uniq-flats.geojson: dist/vicmap-osm-uniq.geojson
+ ./bin/reduceOverlap.js $< $@
+
+loadPgOSM: dist/vicmap-osm.geojson
+ ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vm_osm
data/vicmap.fgb: data/vicmap/ll_gda94/sde_shape/whole/VIC/VMADD/layer/address.shp
ogr2ogr -f FlatGeobuf $@ $<
dist/vicmap-osm.fgb: dist/vicmap-osm.geojson
ogr2ogr -f FlatGeobuf $@ $<
+
+# useful for development to be able to query a database
+loadPgAdd: data/vicmap/ll_gda94/sde_shape/whole/VIC/VMADD/layer/address.shp
+ ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vmadd
+ # index all columns for faster queries during development
+ psql -f src/createIndexQuery.sql --tuples-only | psql
+
+loadPgProp: data/vicmap/ll_gda94/sde_shape/whole/VIC/VMPROP/layer/property_view.shp
+ ogr2ogr -f PostgreSQL PG: $< -lco UNLOGGED=YES -nln vmprop
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c4315df
--- /dev/null
+++ b/README.md
@@ -0,0 +1,79 @@
+# vicmap2osm
+
+Prepares [Vicmap Address](https://www.land.vic.gov.au/maps-and-spatial/spatial-data/vicmap-catalogue/vicmap-address) data for import into OpenStreetMap.
+
+Vicmap Address data © State of Victoria (Department of Environment, Land, Water and Planning), CC BY 4.0, with an [OSMF LWG CC waiver](https://wiki.openstreetmap.org/wiki/File:Vicmap_CCBYPermission_OSM_Final_Jan2018_Ltr.pdf).
+
+## GitLab CI/CD
+
+GitLab CI/CD automates the data processing.
+
+The _prepare_ stage downloads Vicmap Address data and converts it into GeoJSON. Because this takes around 45 minutes, the result is cached through CI/CD for future use.
+
+The _build_ stage does all the processing to produce the import candidate data and intermediate datasets and reports.
+
+## Build candidate files
+
+Download source Vicmap data and convert to GeoJSON:
+
+ make data/vicmap.geojson
+
+Convert into OSM address schema, and omit addresses which don't meet our threshold for import (see _Omitted addresses_ below) (code at `bin/vicmap2osm.js`):
+
+ make dist/vicmap-osm.geojson
+
+Remove duplicates where all address attributes match at the same location or within a small proximity (code at `bin/reduceDuplicates.js`):
+
+ make dist/vicmap-osm-uniq.geojson
+
+Reduce some address points with the same coordinates but different address attributes (see _Overlapping points_ below) (code at `bin/reduceOverlap.js`):
+
+ make dist/vicmap-osm-uniq-flats.geojson
+
+This is only done for strictly overlapping points. Where the geometry varies even slightly, we don't attempt to combine the points.
+
+### Omitted addresses
+
+Source addresses are omitted where they:
+
+1. have neither an `addr:housenumber` nor an `addr:housename`.
+
+Since these addresses have no identifying attribute beyond street, and there are often several of these along a street all with the same street/suburb/postcode, they are of little utility and therefore omitted.
+
+These rules are defined in `lib/filterOSM.js`.
+
+### OSM schema
+
+- `addr:unit` is constructed either as a single value or range where the building unit is supplied
+- `addr:housename` is included where there is a building name present in the source
+- `addr:housenumber` is constructed from the number prefix, main number and number suffix fields for each end of the from/to range, eg `1A-3B`.
+- `addr:street` is constructed from the street proper name, street type and street suffix, formatted as capital case. eg `Main Street North`.
+- `addr:suburb` is constructed from the locality value formatted as capital case.
+- `addr:postcode` is as supplied.
+- `addr:state` is as supplied and should always be `VIC`.
+
+The schema mapping mostly happens in `lib/toOSM.js`.
+
+### Overlapping points
+
+Source address data contains many address points overlapping.
+
+1. First pass, where all the OSM tags are the same, and the points have the exact same geometry, all the duplicates are omitted.
+
+Where each of the housenumber, street, suburb, postcode, state are the same for each of the overlapping points, but only the unit value differs we attempt to reduce these to a single address point without `addr:unit` but instead using [`addr:flats`](https://wiki.openstreetmap.org/wiki/Key:addr:flats).
+
+`addr:flats` is the documented tag for describing the unit numbers at an address.
+
+In the real world where you have different unit numbers for townhouses or villas ideally you'd have different addresses in OSM using `addr:unit` but have each located on each dwelling.
+
+Where you have an apartment building containing multiple units, this import chooses to avoid overlapping addresses each with a different `addr:unit`, instead creating a single node with `addr:flats`.
+
+Where possible, unit numbers are reduced to ranges, for example to create `addr:flats=1-5;10-15;20` instead of `addr:flats=1;2;3;4;5;10;11;12;13;14;15;20`.
+
+Multiple overlapping points don't add any extra value to the OSM data and are harder for mappers to manage, especially for large apartment buildings.
+
+Data consumers can still easily explode `addr:flats` out into overlapping nodes with varying `addr:unit` if desired.
+
+### null values
+
+Values `UNNAMED` and `NOT NAMED` appear as street name and locality names. These values are treated as null/empty values rather than proper names.
diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js
new file mode 100755
index 0000000..c9a7769
--- /dev/null
+++ b/bin/reduceDuplicates.js
@@ -0,0 +1,171 @@
+#!/usr/bin/env node
+
+/**
+ * Remove duplicates (exact tags) at the same location or within a small proximity.
+ */
+
+const fs = require('fs')
+const { Readable, Transform, pipeline } = require('stream')
+const ndjson = require('ndjson')
+const cluster = require('../lib/cluster.js')
+
+// Parse the command line: two positional arguments (input path, output path)
+// plus an optional --debug flag.
+const yargs = require('yargs/yargs')
+const argv = yargs(process.argv.slice(2))
+  .option('debug', {
+    type: 'boolean',
+    description: 'Dumps full debug logs'
+  })
+  .argv
+
+// both positional arguments are mandatory
+const positionalCount = argv._.length
+if (positionalCount < 2) {
+  console.error("Usage: ./reduceDuplicates.js input.geojson output.geojson")
+  process.exit(1)
+}
+
+const [inputFile, outputFile] = argv._
+
+// bail out early when there is nothing to read
+const inputExists = fs.existsSync(inputFile)
+if (!inputExists) {
+  console.error(`${inputFile} not found`)
+  process.exit(1)
+}
+
+// number of features read from the input so far
+let sourceCount = 0
+// address key => list of every feature seen with exactly those address tags
+const features = {}
+
+// tags which together identify an address, in the order they appear in the key
+const addressKeyTags = [
+  'addr:unit',
+  'addr:housename',
+  'addr:housenumber',
+  'addr:street',
+  'addr:suburb',
+  'addr:state',
+  'addr:postcode'
+]
+
+/**
+ * First pass stream. Files every incoming feature into `features` under a key
+ * built from its address tags, so the second pass can compare features which
+ * share the same address. Nothing is passed on downstream.
+ */
+const index = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(feature, encoding, callback) {
+    sourceCount += 1
+
+    // progress counter, rewritten in place every 10,000 features
+    const atProgressStep = sourceCount % 10000 === 0
+    if (atProgressStep) {
+      process.stdout.write(` ${sourceCount / 1000}k\r`)
+    }
+
+    // a missing tag joins as an empty segment
+    const addressKey = addressKeyTags
+      .map(tag => feature.properties[tag])
+      .join('/')
+
+    const alreadySeen = addressKey in features
+    if (!alreadySeen) {
+      features[addressKey] = []
+    }
+    features[addressKey].push(feature)
+
+    callback()
+  }
+})
+
+// Debug output, only produced with --debug: newline delimited GeoJSON of addresses
+// which share all their tags but sit at locations too far apart to be merged.
+// We keep a handle on the ndjson stringify stream itself, because `.pipe()` returns
+// the destination file stream and that cannot accept feature objects.
+const debugDuplicateAddressesStream = argv.debug ? ndjson.stringify() : null
+if (debugDuplicateAddressesStream) {
+  // make sure the target directory exists before opening the file
+  fs.mkdirSync('debug/reduceDuplicates', { recursive: true })
+  debugDuplicateAddressesStream
+    .pipe(fs.createWriteStream('debug/reduceDuplicates/duplicateAddresses.geojson'))
+}
+
+/**
+ * Collapses a list of features into a single feature located at the mean of
+ * their coordinates. Properties are taken from the first feature. A new object
+ * is returned, the indexed features are left untouched.
+ *
+ * @param members non-empty array of features whose geometry.coordinates is an [x, y] pair
+ * @returns Feature with the first member's properties at the averaged position
+ */
+function averageFeature(members) {
+  const sum = members.reduce(
+    (acc, f) => [acc[0] + f.geometry.coordinates[0], acc[1] + f.geometry.coordinates[1]],
+    [0, 0]
+  )
+  const first = members[0]
+  return Object.assign({}, first, {
+    geometry: Object.assign({}, first.geometry, {
+      coordinates: [sum[0] / members.length, sum[1] / members.length]
+    })
+  })
+}
+
+/**
+ * Second pass stream. Receives the keys of `features` one at a time and emits
+ * the de-duplicated feature(s) for that address:
+ *  - a single feature is passed through as is
+ *  - features which all share the same coordinates are reduced to the first one
+ *  - otherwise features are clustered, and each cluster is reduced to one
+ *    feature at the cluster's average location
+ */
+let reduceIndex = 0
+const reduce = new Transform({
+  readableObjectMode: true,
+  writableObjectMode: true,
+  transform(key, encoding, callback) {
+    reduceIndex++
+    if (reduceIndex % 10000 === 0) {
+      process.stdout.write(` ${reduceIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceIndex / sourceCount * 100)}%)\r`)
+    }
+
+    const groupedFeatures = features[key]
+    if (groupedFeatures.length === 1) {
+      // address not duplicated
+      this.push(groupedFeatures[0])
+    } else {
+      // address appears multiple times
+      const distinctCoordinates = new Set(groupedFeatures.map(f => f.geometry.coordinates.join(',')))
+      if (distinctCoordinates.size <= 1) {
+        // features have same properties and same geometry, so true duplicates can reduce to one
+        this.push(groupedFeatures[0])
+      } else {
+        // cluster features with a threshold of 25m
+        // NOTE(review): lib/cluster.js is not visible here, this relies on it returning
+        // an array of clusters, each cluster being an array of features - confirm
+        const clusters = cluster(groupedFeatures, 25)
+
+        if (clusters.length === 1) {
+          // everything within the threshold, output a single average feature
+          this.push(averageFeature(groupedFeatures))
+        } else {
+          // more than one cluster, reduce each cluster to one feature (averaged over
+          // that cluster's own members) and output all of them
+          const clusterAverages = clusters.map(members => averageFeature(members))
+          clusterAverages.forEach(f => this.push(f))
+
+          // report these as address points with the same attributes but different locations beyond the threshold
+          if (debugDuplicateAddressesStream) {
+            const webOfMatches = {
+              type: 'Feature',
+              properties: clusterAverages[0].properties,
+              geometry: {
+                type: 'LineString',
+                coordinates: clusterAverages.map(p => p.geometry.coordinates)
+              }
+            }
+            debugDuplicateAddressesStream.write(webOfMatches)
+          }
+        }
+      }
+    }
+
+    callback()
+  }
+})
+
+// Two passes over the data:
+//  1. read the input and index every feature by its address properties
+//  2. walk the index, reduce duplicated addresses and write the output file
+console.log('First pass to index by address properties')
+pipeline(
+  fs.createReadStream(inputFile),
+  ndjson.parse(),
+  index,
+  err => {
+    if (err) {
+      console.error(err)
+      process.exit(1)
+    } else {
+      console.log(` of ${sourceCount} features found ${Object.keys(features).length} unique addresses`)
+      // second pass to reduce duplicated addresses
+      pipeline(
+        Readable.from(Object.keys(features)),
+        reduce,
+        ndjson.stringify(),
+        fs.createWriteStream(outputFile),
+        err => {
+          if (err) {
+            console.error(err)
+            process.exit(1)
+          } else if (debugDuplicateAddressesStream) {
+            // the debug stream only exists with --debug, give it the chance to
+            // finish before the process exits
+            debugDuplicateAddressesStream.end(() => process.exit(0))
+          } else {
+            process.exit(0)
+          }
+        }
+      )
+    }
+  }
+)
diff --git a/reduceOverlap.js b/bin/reduceOverlap.js
index 8b24baa..3984296 100755
--- a/reduceOverlap.js
+++ b/bin/reduceOverlap.js
@@ -3,16 +3,17 @@
const fs = require('fs')
const { Readable, Transform, pipeline } = require('stream')
const ndjson = require('ndjson')
+const util = require('util')
-const args = process.argv.slice(2)
+const argv = require('yargs/yargs')(process.argv.slice(2)).argv
-if (args.length < 2) {
+if (argv._.length < 2) {
console.error("Usage: ./reduceOverlap.js input.geojson output.geojson")
process.exit(1)
}
-const inputFile = args[0]
-const outputFile = args[1]
+const inputFile = argv._[0]
+const outputFile = argv._[1]
if (!fs.existsSync(inputFile)) {
console.error(`${inputFile} not found`)
@@ -22,12 +23,22 @@ if (!fs.existsSync(inputFile)) {
let sourceCount = 0
const features = {}
+/**
+ * Index features by geometry. Used as a first pass, so a second pass can easily compare
+ * features with the same geometry.
+ */
const index = new Transform({
readableObjectMode: true,
writableObjectMode: true,
transform(feature, encoding, callback) {
sourceCount++
+ if (!argv.quiet) {
+ if (sourceCount % 10000 === 0) {
+ process.stdout.write(` ${sourceCount / 1000}k\r`)
+ }
+ }
+
const geometryKey = feature.geometry.coordinates.join(',')
if (!(geometryKey in features)) {
@@ -39,22 +50,34 @@ const index = new Transform({
}
})
+/**
+ * Reduces features with the same geometry.
+ */
+let reduceIndex = 0
const reduce = new Transform({
readableObjectMode: true,
writableObjectMode: true,
transform(key, encoding, callback) {
+ reduceIndex++
+ if (!argv.quiet) {
+ if (reduceIndex % 10000 === 0) {
+ process.stdout.write(` ${reduceIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceIndex / sourceCount * 100)}%)\r`)
+ }
+ }
var groupedFeatures = features[key]
+
if (groupedFeatures.length === 1) {
- // point not overlapping
+ // only one feature with this geometry, nothing to reduce, output as is
this.push(groupedFeatures[0])
} else {
- // points overlapping, try to reduce to non-overlapping
+ // multiple features with the same geometry
- // if housenumber, street, suburb, state, postcode are all the same
+ // if housename, housenumber, street, suburb, state, postcode are all the same
// and it's only unit which differs,
// and there is an address with no unit
// then remove all the unit addresses and add them as addr:flats on the no unit address
+ const sameHousename = [...new Set(groupedFeatures.map(f => f.properties['addr:housename']))].length <= 1
const sameHousenumber = [...new Set(groupedFeatures.map(f => f.properties['addr:housenumber']))].length <= 1
const sameStreet = [...new Set(groupedFeatures.map(f => f.properties['addr:street']))].length <= 1
const sameSuburb = [...new Set(groupedFeatures.map(f => f.properties['addr:suburb']))].length <= 1
@@ -63,11 +86,13 @@ const reduce = new Transform({
const hasNonUnit = groupedFeatures.map(f => 'addr:unit' in f.properties).includes(false)
- if (sameHousenumber && sameStreet && sameSuburb && sameState && samePostcode) {
+ if (sameHousename && sameHousenumber && sameStreet && sameSuburb && sameState && samePostcode) {
if (hasNonUnit) {
const nonUnitFeatures = groupedFeatures.filter(f => (!('addr:unit' in f.properties)))
if (nonUnitFeatures.length > 1) {
// multiple non-unit features, unsure how to reduce
+ console.log('multiple non-unit features, unsure how to reduce')
+ console.dir(groupedFeatures, {depth: null})
} else {
const nonUnitFeature = nonUnitFeatures[0]
@@ -76,6 +101,8 @@ const reduce = new Transform({
// if allOtherUnits.length is one then that means we have one address without a unit and one with a unit at the same point
// TODO should we just drop the non-unit address and keep the addr:unit one?
+ // need to determine if you always have a non-unit address for the unit address, if there is then
+ // perhaps we can safely drop the non-unit address and use a single addr:unit
// adapted from https://stackoverflow.com/a/54973116/6702659
const sortedAllOtherUnitsAsRanges = allOtherUnits
@@ -100,6 +127,9 @@ const reduce = new Transform({
const units = groupedFeatures.filter(f => 'addr:unit' in f.properties).map(f => f.properties['addr:unit'])
// TODO assert units.length > 1
+ if (units.length <= 1) {
+ // console.log(`all have same housenumber, street, suburb, state, postcode but no non-unit, but only found ${units.length} units`, units)
+ }
const feature = groupedFeatures[0]
delete feature.properties['addr:unit']
@@ -122,11 +152,13 @@ const reduce = new Transform({
this.push(feature)
}
} else {
- console.log('addresses with the same geometry, however more than unit differs')
+ // addresses with the same geometry, however more than unit differs
// TODO need to investigate to see what we can/shoud do about these
- //console.log(groupedFeatures)
for (let i = 0; i < groupedFeatures.length; i++) {
this.push(groupedFeatures[i])
+ if (debugSameGeometry) {
+ debugSameGeometry.write(groupedFeatures[i])
+ }
}
}
}
@@ -135,6 +167,11 @@ const reduce = new Transform({
}
})
+const debugSameGeometry = argv.debug ?
+ ndjson.stringify()
+ .pipe(fs.createWriteStream('debug/reduceOverlap/sameGeometry.geojson'))
+ : null
+
// first pass to index by geometry
console.log('First pass to index by geometry')
pipeline(
@@ -158,6 +195,7 @@ pipeline(
console.log(err)
process.exit(1)
} else {
+ debugSameGeometry.end()
process.exit(0)
}
}
diff --git a/vicmap2osm.js b/bin/vicmap2osm.js
index e579d98..b252f24 100755
--- a/vicmap2osm.js
+++ b/bin/vicmap2osm.js
@@ -6,15 +6,24 @@ const ndjson = require('ndjson')
const toOSM = require('./toOSM.js')
const filterOSM = require('./filterOSM.js')
-const args = process.argv.slice(2)
+const argv = require('yargs/yargs')(process.argv.slice(2))
+ .option('debug', {
+ type: 'boolean',
+ description: 'Dumps full debug logs'
+ })
+ .option('tracing', {
+ type: 'boolean',
+ description: 'Includes _pfi tags to aid debugging'
+ })
+ .argv
-if (args.length < 2) {
+if (argv._.length < 2) {
console.error("Usage: ./vicmap2osm.js input.geojson output.geojson")
process.exit(1)
}
-const inputFile = args[0]
-const outputFile = args[1]
+const inputFile = argv._[0]
+const outputFile = argv._[1]
if (!fs.existsSync(inputFile)) {
console.error(`${inputFile} not found`)
@@ -26,10 +35,14 @@ const transform = new Transform({
writableObjectMode: true,
transform(feature, encoding, callback) {
// convert source Feature into a Feature per the OSM schema
- const osm = toOSM(feature)
+ const osm = toOSM(feature, {
+ tracing: argv.tracing
+ })
- // some addresses we skip importing into OSM
- if (filterOSM(osm)) {
+ // some addresses we skip importing into OSM, see README.md#omitted-addresses
+ if (filterOSM(osm, {
+ debug: argv.debug
+ })) {
this.push(osm)
}
diff --git a/filterOSM.js b/filterOSM.js
deleted file mode 100644
index 93db4a7..0000000
--- a/filterOSM.js
+++ /dev/null
@@ -1,10 +0,0 @@
-module.exports = (feature) => {
-
- // skip any addresses without a housenumber
- // eg PFI 53396626 has no housenumber
- if (!('addr:housenumber' in feature.properties)) {
- return false
- }
-
- return true
-}
diff --git a/cluster.js b/lib/cluster.js
index c716063..c716063 100644
--- a/cluster.js
+++ b/lib/cluster.js
diff --git a/lib/filterOSM.js b/lib/filterOSM.js
new file mode 100644
index 0000000..e530773
--- /dev/null
+++ b/lib/filterOSM.js
@@ -0,0 +1,16 @@
+/**
+ * Decides whether a feature in OSM schema is kept for import.
+ *
+ * @param feature Feature in OSM schema
+ * @param options optional, when `options.debug` is truthy every omitted feature is logged
+ * @returns true when the feature should be kept, false when it should be omitted
+ */
+module.exports = (feature, options) => {
+
+  // skip any addresses without either a housenumber or housename
+  // eg PFI 53396626 has no housenumber
+  if (
+    !('addr:housenumber' in feature.properties) &&
+    !('addr:housename' in feature.properties)
+  ) {
+    // the debug flag is passed in by the caller, there is no argv in this module
+    if (options && options.debug) {
+      console.log(`PFI ${feature.properties._pfi} has neither a addr:housename or addr:housenumber, filtering`)
+    }
+    return false
+  }
+
+  return true
+}
diff --git a/toOSM.js b/lib/toOSM.js
index dd993d0..f9fab84 100644
--- a/toOSM.js
+++ b/lib/toOSM.js
@@ -65,12 +65,22 @@ const emptyNames = [
'NOT NAMED'
]
-module.exports = (sourceFeature) => {
+/**
+ * Transforms a GeoJSON Feature from the Vicmap address schema into OSM schema
+ *
+ * @param sourceFeature Feature in Vicmap address schema
+ * @returns Feature in OSM schema
+ */
+module.exports = (sourceFeature, options) => {
const outputFeature = Object.assign({}, sourceFeature)
const sourceProperties = sourceFeature.properties
const outputProperties = {}
+ if (options && options.tracing) {
+ outputProperties['_pfi'] = sourceProperties.PFI
+ }
+
// Building sub address type (eg UNIT OFFICE SHOP)
//
// bld_unit_*
@@ -104,9 +114,11 @@ module.exports = (sourceFeature) => {
outputProperties['addr:unit'] = bld_unit
}
+ /*
if (sourceProperties.BLGUNTTYP && sourceProperties.BLGUNTTYP in buildingUnitType) {
outputProperties['addr:unit:type'] = buildingUnitType[sourceProperties.BLGUNTTYP]
}
+ */
if (sourceProperties.BUILDING) {
outputProperties['addr:housename'] = sourceProperties.BUILDING
diff --git a/src/createIndexQuery.sql b/src/createIndexQuery.sql
new file mode 100644
index 0000000..a4e2712
--- /dev/null
+++ b/src/createIndexQuery.sql
@@ -0,0 +1,5 @@
+-- Generates one CREATE INDEX CONCURRENTLY statement for every column of the vmadd
+-- table in the public schema. The statements are returned as rows of text, not
+-- executed here; the Makefile loadPgAdd target pipes this output into a second
+-- psql session which runs them.
+SELECT
+ 'CREATE INDEX CONCURRENTLY ' || table_name || '_' || column_name || ' ON ' || table_name || ' ("' || column_name || '");'
+FROM information_schema.columns
+WHERE table_schema = 'public'
+ AND table_name = 'vmadd';