aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xbin/reduceRangeDuplicates.js59
-rw-r--r--package.json3
-rw-r--r--test/reduceRangeDuplicates.js81
-rw-r--r--yarn.lock5
4 files changed, 135 insertions, 13 deletions
diff --git a/bin/reduceRangeDuplicates.js b/bin/reduceRangeDuplicates.js
index 32155a4..b719124 100755
--- a/bin/reduceRangeDuplicates.js
+++ b/bin/reduceRangeDuplicates.js
@@ -22,6 +22,10 @@ const argv = require('yargs/yargs')(process.argv.slice(2))
type: 'boolean',
description: 'Dumps full debug logs'
})
+ .option('verbose', {
+ type: 'boolean',
+ description: 'Verbose logging'
+ })
.argv
if (argv._.length < 2) {
@@ -37,10 +41,21 @@ if (!fs.existsSync(inputFile)) {
process.exit(1)
}
+function hash(feature) { // builds a string key from a feature's address tags; addr:unit is not part of the key
+ return [
+ feature.properties['addr:housenumber'],
+ feature.properties['addr:street'],
+ feature.properties['addr:suburb'],
+ feature.properties['addr:state'],
+ feature.properties['addr:postcode']
+ ].join('/') // a missing tag joins as an empty string, e.g. '304-306/Cardigan Street///'
+}
+
let sourceCount = 0
const ranges = []
const nonRangesByStreet = {}
+const rangesRemovedInFilterA = {}
// index all non-range addresses by street, suburb, state, postcode
const index = new Transform({
@@ -78,7 +93,7 @@ const index = new Transform({
const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/
/*
-* First pass removes ranges where each endpoint of the range exists seperatly
+* Second pass, filter A removes ranges where each endpoint of the range exists separately
* eg.
* - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
* - 304 Cardigan Street Calton
@@ -98,9 +113,8 @@ const reduceRange = new Transform({
}
const isRange = feature.properties['addr:housenumber'].split('-').length > 1
-
if (isRange) {
- // see if it can be removed when each end point of the range is included seperatly
+ // see if it can be removed when each end point of the range is included separately
const start = feature.properties['addr:housenumber'].split('-')[0]
const end = feature.properties['addr:housenumber'].split('-')[1]
@@ -123,8 +137,8 @@ const reduceRange = new Transform({
let pre = ''
let suf = ''
- matchCandidates.map(matchCandidate => {
- if (start === matchCandidate.properties['addr:housenumber']) {
+ for (const matchCandidate of matchCandidates) {
+ if (!foundStart && start === matchCandidate.properties['addr:housenumber']) {
foundStart = true
const match = start.match(regexp)
@@ -132,13 +146,18 @@ const reduceRange = new Transform({
pre = match.groups.pre
suf = match.groups.suf
}
- if (end === matchCandidate.properties['addr:housenumber']) {
+ if (!foundEnd && end === matchCandidate.properties['addr:housenumber']) {
foundEnd = true
const match = end.match(regexp)
endNum = match.groups.num
}
- })
+
+ if (foundStart && foundEnd) {
+ // stop early
+ break
+ }
+ }
if (foundStart && foundEnd) {
// found both start and end
@@ -160,10 +179,18 @@ const reduceRange = new Transform({
if (!foundAllIntermediates) {
// some intermediates were missing
// but we'll pretend that's okay and let the geocoding algorithm use it's own interpolation to still find results
- console.log('found endpoints but some intermediates are missing', feature)
+ if (argv.verbose) {
+ console.log('Filter A: Found endpoints but some intermediates are missing', feature)
+ }
}
// can be removed, feature not pushed
+ if (argv.verbose) {
+ console.log(`Filter A: ${feature.properties['addr:housenumber']} can be removed`)
+ }
+
+ // keep track of removed features for filter B, so we don't double remove both range and midpoints
+ rangesRemovedInFilterA[hash(feature)] = true
} else {
// since not both start and end found, then still include the range
this.push(feature)
@@ -183,7 +210,7 @@ const reduceRange = new Transform({
})
/*
-* Second pass removes ane non-range elements where the range exists, and wasn't removed from the first pass
+* Second pass, filter B removes any non-range elements where the range exists, and wasn't removed by filter A
* eg.
* - 249-263 Faraday Street
* - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
@@ -201,11 +228,12 @@ const reduceNonRange = new Transform({
const isRange = feature.properties['addr:housenumber'].split('-').length > 1
if (!isRange) {
- // not a range, ahall be removed removed when this non-range exists within a range, but the range wasn't removed already
+ // not a range, shall be removed where this non-range exists within a range, but the range wasn't removed already
let dropFeature = false
for (let i = 0; i < ranges.length; i++) {
const range = ranges[i]
- if (withinRange(feature, range)) {
+ // if the range wasn't just removed in filter A, and the feature is within the range
+ if (!(hash(range) in rangesRemovedInFilterA) && withinRange(feature, range)) {
// found within a range, drop feature unless would drop addr:unit information
if ('addr:unit' in feature.properties) {
// safe to drop if the same addr:unit is also on the range
@@ -227,7 +255,13 @@ const reduceNonRange = new Transform({
}
if (!dropFeature) {
this.push(feature)
+ } else {
+ if (argv.verbose) {
+ console.log(`Filter B: Dropping ${feature.properties['addr:housenumber']}`)
+ }
}
+ } else {
+ this.push(feature)
}
callback()
@@ -257,7 +291,8 @@ pipeline(
console.log(err)
process.exit(1)
} else {
- // second pass to reduce overlapping features
+ console.log('Second pass to remove range duplicates')
+ // second pass to remove range duplicates
pipeline(
fs.createReadStream(inputFile),
ndjson.parse(),
diff --git a/package.json b/package.json
index 388427e..f13575e 100644
--- a/package.json
+++ b/package.json
@@ -6,7 +6,7 @@
"author": "Andrew Harvey <andrew@alantgeo.com.au>",
"license": "MIT",
"scripts": {
- "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js test/valueLimits.js"
+ "test": "./node_modules/.bin/tape test/toOSM.js test/cluster.js test/unitsToRanges.js test/withinRange.js test/valueLimits.js test/reduceRangeDuplicates.js"
},
"dependencies": {
"capital-case": "^1.0.4",
@@ -14,6 +14,7 @@
"clone-deep": "^4.0.1",
"flatbush": "^3.3.0",
"geoflatbush": "^1.0.0",
+ "mktemp": "^1.0.0",
"ndjson": "^2.0.0",
"readable-stream": "^3.6.0",
"tape": "^5.2.2",
diff --git a/test/reduceRangeDuplicates.js b/test/reduceRangeDuplicates.js
new file mode 100644
index 0000000..e180841
--- /dev/null
+++ b/test/reduceRangeDuplicates.js
@@ -0,0 +1,81 @@
+const test = require('tape')
+const fs = require('fs')
+const child_process = require('child_process')
+const mktemp = require('mktemp')
+
+function createFeature(housenumber, street, suburb) { // test fixture: a GeoJSON-style Feature carrying only address tags
+ return {
+ type: 'Feature',
+ properties: {
+ 'addr:housenumber': housenumber,
+ 'addr:street': street,
+ 'addr:suburb': suburb,
+ 'addr:state': 'VIC', // state and postcode are fixed, so fixtures differ only by the three arguments
+ 'addr:postcode': '0000'
+ },
+ geometry: null // no geometry is attached to the fixture
+ }
+}
+
+test('reduceRangeDuplicates', t => { // filter A case: a range whose two endpoints also exist as separate addresses
+ const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
+ const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
+ const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
+
+ const AB = createFeature('304-306', 'Cardigan Street', 'Carlton')
+ const A = createFeature('304', 'Cardigan Street', 'Carlton')
+ const B = createFeature('306', 'Cardigan Street', 'Carlton')
+
+ // input holds all three features, one JSON document per line, range first
+ fs.appendFileSync(inputFile, JSON.stringify(AB) + '\n')
+ fs.appendFileSync(inputFile, JSON.stringify(A) + '\n')
+ fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
+
+ // expected output is only the two endpoints, in input order; the range is dropped
+ fs.appendFileSync(expectedFile, JSON.stringify(A) + '\n')
+ fs.appendFileSync(expectedFile, JSON.stringify(B) + '\n')
+
+ child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`) // script path is relative to the current working directory
+
+ t.same( // compares the two files' contents as read back from disk
+ fs.readFileSync(outputFile),
+ fs.readFileSync(expectedFile),
+ 'range with endpoints appearing separately, drops range'
+ )
+
+ fs.unlinkSync(inputFile) // NOTE(review): cleanup is skipped if execSync throws above, leaving the temp files behind
+ fs.unlinkSync(outputFile)
+ fs.unlinkSync(expectedFile)
+
+ t.end()
+})
+
+test('reduceRangeDuplicates', t => { // filter B case: a range plus one address inside it, with neither endpoint present on its own
+ const inputFile = mktemp.createFileSync('/tmp/input_XXXXX.geojson')
+ const outputFile = mktemp.createFileSync('/tmp/output_XXXXX.geojson')
+ const expectedFile = mktemp.createFileSync('/tmp/expected_XXXXX.geojson')
+
+ const AC = createFeature('249-263', 'Faraday Street', 'Carlton')
+ const B = createFeature('251', 'Faraday Street', 'Carlton')
+
+ // input holds both features, one JSON document per line, range first
+ fs.appendFileSync(inputFile, JSON.stringify(AC) + '\n')
+ fs.appendFileSync(inputFile, JSON.stringify(B) + '\n')
+
+ // expected output is only the range; the midpoint 251 is dropped
+ fs.appendFileSync(expectedFile, JSON.stringify(AC) + '\n')
+
+ child_process.execSync(`./bin/reduceRangeDuplicates.js ${inputFile} ${outputFile}`) // script path is relative to the current working directory
+
+ t.same( // compares the two files' contents as read back from disk
+ fs.readFileSync(outputFile),
+ fs.readFileSync(expectedFile),
+ 'range with lone midpoint, drops midpoint'
+ )
+
+ fs.unlinkSync(inputFile) // NOTE(review): cleanup is skipped if execSync throws above, leaving the temp files behind
+ fs.unlinkSync(outputFile)
+ fs.unlinkSync(expectedFile)
+
+ t.end()
+})
diff --git a/yarn.lock b/yarn.lock
index 1b0b2f3..d29620d 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -436,6 +436,11 @@ minimist@^1.2.5:
resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.5.tgz#67d66014b66a6a8aaa0c083c5fd58df4e4e97602"
integrity sha512-FM9nNUYrRBAELZQT3xeZQ7fmMOBg6nWNmJKTcgsJeaLstP/UODVpGsr5OhXhhXg6f+qtJ8uiZ+PUxkDWcgIXLw==
+mktemp@^1.0.0:
+ version "1.0.0"
+ resolved "https://registry.yarnpkg.com/mktemp/-/mktemp-1.0.0.tgz#b670eff23f52d6529e1dc362cb74ddf85448a9e3"
+ integrity sha512-2duBeS0A75x0M3sCoY0R1TiLsYfIBUtNBNWS++eo+bX/ObVqzblqnEQhlaepoBOLD14wklsV3cYxZ68o5qYO8A==
+
ndjson@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/ndjson/-/ndjson-2.0.0.tgz#320ac86f6fe53f5681897349b86ac6f43bfa3a19"