about summary refs log tree commit diff
path: root/bin
diff options
context:
space:
mode:
author Andrew Harvey <andrew@alantgeo.com.au> 2021-05-18 16:07:17 +1000
committer Andrew Harvey <andrew@alantgeo.com.au> 2021-05-18 16:07:17 +1000
commit eb57ce6fdec3a959288b2a30c499dcea7e81a444 (patch)
tree 1dba7a1dcf8c11f334fd655b2f6cab67a0aff427 /bin
parent 27ef54718487c558182fc73609a649a1adcf9b26 (diff)
improve performance by indexing by street
Diffstat (limited to 'bin')
-rwxr-xr-x bin/reduceDuplicates.js 7
-rwxr-xr-x bin/reduceOverlap.js 5
-rwxr-xr-x bin/reduceRangeDuplicates.js 80
3 files changed, 53 insertions, 39 deletions
diff --git a/bin/reduceDuplicates.js b/bin/reduceDuplicates.js
index e2ba562..985a0e1 100755
--- a/bin/reduceDuplicates.js
+++ b/bin/reduceDuplicates.js
@@ -185,7 +185,7 @@ if (argv.debug) {
}
// first pass to index by geometry
-console.log('First pass to index by address properties')
+console.log('Pass 1/2: index by address properties')
pipeline(
fs.createReadStream(inputFile),
ndjson.parse(),
@@ -195,8 +195,9 @@ pipeline(
console.log(err)
process.exit(1)
} else {
- console.log(` of ${sourceCount} features found ${Object.keys(features).length} unique addresses`)
- // second pass to reduce overlapping features
+ console.log(` of ${sourceCount.toLocaleString()} features found ${Object.keys(features).length.toLocaleString()} unique addresses`)
+ // second pass to reduce duplicate features
+ console.log('Pass 2/2: reduce duplicate features')
pipeline(
Readable.from(Object.keys(features)),
reduce,
diff --git a/bin/reduceOverlap.js b/bin/reduceOverlap.js
index 2255368..ae5f1dc 100755
--- a/bin/reduceOverlap.js
+++ b/bin/reduceOverlap.js
@@ -206,7 +206,7 @@ if (argv.debug) {
}
// first pass to index by geometry
-console.log('First pass to index by geometry')
+console.log('Pass 1/2: index by geometry')
pipeline(
fs.createReadStream(inputFile),
ndjson.parse(),
@@ -216,8 +216,9 @@ pipeline(
console.log(err)
process.exit(1)
} else {
- console.log(` of ${sourceCount} features found ${Object.keys(features).length} unique geometries`)
+ console.log(` of ${sourceCount.toLocaleString()} features found ${Object.keys(features).length.toLocaleString()} unique geometries`)
// second pass to reduce overlapping features
+ console.log('Pass 2/2: reduce overlapping features')
pipeline(
Readable.from(Object.keys(features)),
reduce,
diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js
index 093d9bc..1aa232c 100755
--- a/bin/reduceRangeDuplicates.js
+++ b/bin/reduceRangeDuplicates.js
@@ -53,7 +53,7 @@ function hash(feature) {
let sourceCount = 0
-const ranges = []
+const rangesByStreet = {}
const nonRangesByStreet = {}
const rangesRemovedInFilterA = {}
@@ -70,16 +70,18 @@ const index = new Transform({
const isRange = feature.properties['addr:housenumber'].split('-').length > 1
+ const key = [
+ feature.properties['addr:street'],
+ feature.properties['addr:suburb'],
+ feature.properties['addr:state'],
+ feature.properties['addr:postcode']
+ ].join('/')
if (isRange) {
- ranges.push(feature)
+ if (!(key in rangesByStreet)) {
+ rangesByStreet[key] = []
+ }
+ rangesByStreet[key].push(feature)
} else {
- const key = [
- feature.properties['addr:street'],
- feature.properties['addr:suburb'],
- feature.properties['addr:state'],
- feature.properties['addr:postcode']
- ].join('/')
-
if (!(key in nonRangesByStreet)) {
nonRangesByStreet[key] = []
}
@@ -229,32 +231,42 @@ const reduceNonRange = new Transform({
if (!isRange) {
// not a range, shall be removed where this non-range exists within a range, but the range wasn't removed already
+
+ const key = [
+ feature.properties['addr:street'],
+ feature.properties['addr:suburb'],
+ feature.properties['addr:state'],
+ feature.properties['addr:postcode']
+ ].join('/')
+
let dropFeature = false
- for (let i = 0; i < ranges.length; i++) {
- const range = ranges[i]
- // if the range wasn't just removed in filter A, and the feature is within the range
- if (!(hash(range) in rangesRemovedInFilterA) && withinRange(feature, range)) {
- // found within a range, drop feature unless would drop addr:unit or addr:flats information
- if ('addr:unit' in feature.properties || 'addr:flats' in feature.properties) {
- // safe to drop if the same addr:unit and addr:flats is also on the range
- if (
- 'addr:unit' in feature.properties ? ('addr:unit' in range.properties && feature.properties['addr:unit'] === range.properties['addr:unit']) : true &&
- 'addr:flats' in feature.properties ? ('addr:flats' in range.properties && feature.properties['addr:flats'] === range.properties['addr:flats']) : true
- ) {
- dropFeature = true
- } else {
- // since the non-range feature has a unit that the range doesn't have, don't drop it
- dropFeature = false
- if (argv.debug) {
- debugStreams['addrInRangeDifferentUnits'].write(feature)
- debugStreams['addrInRangeDifferentUnits'].write(range)
+ if (key in rangesByStreet) {
+ for (let i = 0; i < rangesByStreet[key].length; i++) {
+ const range = rangesByStreet[key][i]
+ // if the range wasn't just removed in filter A, and the feature is within the range
+ if (!(hash(range) in rangesRemovedInFilterA) && withinRange(feature, range)) {
+ // found within a range, drop feature unless would drop addr:unit or addr:flats information
+ if ('addr:unit' in feature.properties || 'addr:flats' in feature.properties) {
+ // safe to drop if the same addr:unit and addr:flats is also on the range
+ if (
+ 'addr:unit' in feature.properties ? ('addr:unit' in range.properties && feature.properties['addr:unit'] === range.properties['addr:unit']) : true &&
+ 'addr:flats' in feature.properties ? ('addr:flats' in range.properties && feature.properties['addr:flats'] === range.properties['addr:flats']) : true
+ ) {
+ dropFeature = true
+ } else {
+ // since the non-range feature has a unit that the range doesn't have, don't drop it
+ dropFeature = false
+ if (argv.debug) {
+ debugStreams['addrInRangeDifferentUnits'].write(feature)
+ debugStreams['addrInRangeDifferentUnits'].write(range)
+ }
}
- }
- } else {
- // no addr:unit or addr:flats on the feature to safe to drop
- dropFeature = true
+ } else {
+ // no addr:unit or addr:flats on the feature to safe to drop
+ dropFeature = true
+ }
+ break
}
- break
}
}
if (!dropFeature) {
@@ -285,7 +297,7 @@ if (argv.debug) {
}
// first pass to index by geometry
-console.log('First pass to index non-ranges by street,suburb,state,postcode properties')
+console.log('Pass 1/2: index non-ranges by street,suburb,state,postcode properties')
pipeline(
fs.createReadStream(inputFile),
ndjson.parse(),
@@ -295,8 +307,8 @@ pipeline(
console.log(err)
process.exit(1)
} else {
- console.log('Second pass to remove range duplicates')
// second pass to remove range duplicates
+ console.log('Pass 2/2: remove range duplicates')
pipeline(
fs.createReadStream(inputFile),
ndjson.parse(),