#!/usr/bin/env node

/**
 * Remove duplicates created by addresses from a range also appearing individually
 * eg.
 *  - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
 *  - 304 Cardigan Street Carlton
 *  - 306 Cardigan Street Carlton
 *
 *  - 249-263 Faraday Street
 *  - 251 Faraday Street - removed since not all addresses from the range exist, but this one is covered by the range
 */

const fs = require('fs')
const { Transform, pipeline } = require('stream')
const ndjson = require('ndjson')
const withinRange = require('../lib/withinRange.js')

const argv = require('yargs/yargs')(process.argv.slice(2))
  .option('debug', {
    type: 'boolean',
    description: 'Dumps full debug logs'
  })
  .option('verbose', {
    type: 'boolean',
    description: 'Verbose logging'
  })
  .argv

if (argv._.length < 2) {
  console.error("Usage: ./reduceRangeDuplicates.js input.geojson output.geojson")
  process.exit(1)
}

const inputFile = argv._[0]
const outputFile = argv._[1]

if (!fs.existsSync(inputFile)) {
  console.error(`${inputFile} not found`)
  process.exit(1)
}

/**
 * Builds a string identifying a feature by its address tags.
 *
 * @param {object} feature - feature with a `properties` object holding addr:* tags
 * @returns {string} housenumber, street, suburb, state and postcode joined by '/'
 */
function hash(feature) {
  return [
    feature.properties['addr:housenumber'],
    feature.properties['addr:street'],
    feature.properties['addr:suburb'],
    feature.properties['addr:state'],
    feature.properties['addr:postcode']
  ].join('/')
}

// number of features read during the first pass, used for progress output in later passes
let sourceCount = 0

// every feature whose addr:housenumber contains a '-'
const ranges = []

// non-range features grouped under a "street/suburb/state/postcode" key
const nonRangesByStreet = {}

// hash(feature) => true for every range dropped by filter A
const rangesRemovedInFilterA = {}

// index all non-range addresses by street, suburb, state, postcode
// (this stream only collects into the structures above, it never pushes anything downstream)
const index = new Transform({
  readableObjectMode: true,
  writableObjectMode: true,
  transform(feature, encoding, callback) {
    sourceCount++

    if (sourceCount % 10000 === 0) {
      process.stdout.write(` ${sourceCount / 1000}k\r`)
    }

    const isRange = feature.properties['addr:housenumber'].split('-').length > 1

    if (isRange) {
      ranges.push(feature)
    } else {
      const key = [
        feature.properties['addr:street'],
        feature.properties['addr:suburb'],
        feature.properties['addr:state'],
        feature.properties['addr:postcode']
      ].join('/')

      if (!(key in nonRangesByStreet)) {
        nonRangesByStreet[key] = []
      }
      nonRangesByStreet[key].push(feature)
    }

    callback()
  }
})

// splits a house number into a non-digit prefix, a run of digits and a non-digit suffix;
// the named groups are read as match.groups.pre / .num / .suf by filter A
const regexp = /^(?<pre>\D*)(?<num>\d*)(?<suf>\D*)$/

/*
 * Second pass, filter A removes ranges where each endpoint of the range exists separately
 * as a non-range address on the same street/suburb/state/postcode
 * eg.
 *  - 304-306 Cardigan Street Carlton - range can be removed since each individual address exists
 *  - 304 Cardigan Street Carlton
 *  - 306 Cardigan Street Carlton
 *
 * Ranges without both endpoints present, and all non-range addresses, are passed through
 * unchanged (non-ranges are considered for removal later by filter B)
 * eg.
 *  - 249-263 Faraday Street - kept, since 249 and 263 don't both exist individually
 *  - 251 Faraday Street - passed through here
 */
let reduceRangeIndex = 0
const reduceRange = new Transform({
  readableObjectMode: true,
  writableObjectMode: true,
  transform(feature, encoding, callback) {
    reduceRangeIndex++
    if (reduceRangeIndex % 10000 === 0) {
      process.stdout.write(` ${reduceRangeIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceRangeIndex / sourceCount * 100)}%)\r`)
    }

    const rangeParts = feature.properties['addr:housenumber'].split('-')
    if (rangeParts.length < 2) {
      // not a range, we will see if it can be removed by filter B
      // (removed when this non-range exists within a range, and the range wasn't removed here)
      this.push(feature)
      callback()
      return
    }

    // see if the range can be removed because each end point is included separately
    const [start, end] = rangeParts

    const key = [
      feature.properties['addr:street'],
      feature.properties['addr:suburb'],
      feature.properties['addr:state'],
      feature.properties['addr:postcode']
    ].join('/')

    if (!(key in nonRangesByStreet)) {
      // there are no non-ranges on this street so still include the range
      this.push(feature)
      callback()
      return
    }

    // house numbers of the non-range addresses on the same street
    const houseNumbers = new Set(
      nonRangesByStreet[key].map(candidate => candidate.properties['addr:housenumber'])
    )

    if (!houseNumbers.has(start) || !houseNumbers.has(end)) {
      // since not both start and end found, then still include the range
      this.push(feature)
      callback()
      return
    }

    // found both start and end, see if any intermediates are missing
    // the outcome is informational only: the range is removed either way
    const startMatch = start.match(regexp)
    const endMatch = end.match(regexp)
    let foundAllIntermediates = true
    if (startMatch !== null && endMatch !== null) {
      const { pre, suf } = startMatch.groups
      // parse to integers so that "+ 2" is arithmetic rather than string concatenation;
      // a missing number parses to NaN, which makes the loop condition false straight away
      const startNum = Number.parseInt(startMatch.groups.num, 10)
      const endNum = Number.parseInt(endMatch.groups.num, 10)

      // step by 2, intermediates are looked up on the same odd/even sequence as the start
      for (let i = startNum + 2; i <= endNum - 2 && foundAllIntermediates; i += 2) {
        if (!houseNumbers.has(`${pre}${i}${suf}`)) {
          foundAllIntermediates = false
        }
      }
    }

    if (!foundAllIntermediates) {
      // some intermediates were missing
      // but we'll pretend that's okay and let the geocoding algorithm use its own interpolation to still find results
      if (argv.verbose) {
        console.log('Filter A: Found endpoints but some intermediates are missing', feature)
      }
    }

    // can be removed, feature not pushed
    if (argv.verbose) {
      console.log(`Filter A: ${feature.properties['addr:housenumber']} can be removed`)
    }

    // keep track of removed features for filter B, so we don't double remove both range and midpoints
    rangesRemovedInFilterA[hash(feature)] = true

    callback()
  }
})

/*
 * Second pass, filter B drops a non-range address when it falls within a range
 * that filter A has not recorded as removed
 * eg.
 *  - 249-263 Faraday Street
 *  - 251 Faraday Street - removed since it is covered by the range
 *
 * A non-range carrying an addr:unit is only dropped when the covering range has
 * the same addr:unit, so unit information is not lost.
 */
let reduceNonRangeIndex = 0
const reduceNonRange = new Transform({
  readableObjectMode: true,
  writableObjectMode: true,
  transform(feature, encoding, callback) {
    reduceNonRangeIndex += 1
    const atProgressTick = reduceNonRangeIndex % 10000 === 0
    if (atProgressTick) {
      process.stdout.write(` ${reduceNonRangeIndex / 1000}k / ${Math.round(sourceCount / 1000)}k (${Math.round(reduceNonRangeIndex / sourceCount * 100)}%)\r`)
    }

    // ranges are never touched by this filter
    const houseNumberParts = feature.properties['addr:housenumber'].split('-')
    if (houseNumberParts.length > 1) {
      this.push(feature)
      callback()
      return
    }

    // first range that survived filter A and contains this address, if any
    const coveringRange = ranges.find(candidate => {
      const removedInFilterA = hash(candidate) in rangesRemovedInFilterA
      return !removedInFilterA && withinRange(feature, candidate)
    })

    let shouldDrop = false
    if (coveringRange !== undefined) {
      const featureHasUnit = 'addr:unit' in feature.properties
      if (!featureHasUnit) {
        // nothing unit-specific would be lost, so the range covers it
        shouldDrop = true
      } else {
        const sameUnitOnRange = 'addr:unit' in coveringRange.properties &&
          feature.properties['addr:unit'] === coveringRange.properties['addr:unit']
        if (sameUnitOnRange) {
          shouldDrop = true
        } else if (argv.debug) {
          // the feature has a unit the range lacks: keep it and record the pair
          debugStreams['addrInRangeDifferentUnits'].write(feature)
          debugStreams['addrInRangeDifferentUnits'].write(coveringRange)
        }
      }
    }

    if (shouldDrop) {
      if (argv.verbose) {
        console.log(`Filter B: Dropping ${feature.properties['addr:housenumber']}`)
      }
    } else {
      this.push(feature)
    }

    callback()
  }
})

// ndjson streams used to dump debug features, one per debug key
// debugStreams[key] is the ndjson stringifier that features are written to,
// debugStreamOutputs[key] is what piping it into the file write stream returns
const debugKeys = ['addrInRangeDifferentUnits']
const debugStreams = {}
const debugStreamOutputs = {}

if (argv.debug) {
  for (const key of debugKeys) {
    const stringifier = ndjson.stringify()
    const fileSink = fs.createWriteStream(`debug/reduceRangeDuplicates/${key}.geojson`)
    debugStreams[key] = stringifier
    debugStreamOutputs[key] = stringifier.pipe(fileSink)
  }
}

// logs a pipeline error and exits with a failure status
function exitWithError(err) {
  console.log(err)
  process.exit(1)
}

// exits successfully, first ending the debug streams and waiting for their outputs to finish when --debug is set
function exitWhenOutputsFlushed() {
  if (!argv.debug) {
    process.exit(0)
    return
  }

  for (const key of debugKeys) {
    debugStreams[key].end()
  }

  const flushed = debugKeys.map(key => new Promise(resolve => {
    debugStreamOutputs[key].on('finish', () => {
      console.log(`saved debug/reduceRangeDuplicates/${key}.geojson`)
      resolve()
    })
  }))

  Promise.all(flushed)
    .then(() => {
      process.exit(0)
    })
}

// second pass: re-read the input, run filter A then filter B, and write the result
function runSecondPass() {
  console.log('Second pass to remove range duplicates')
  pipeline(
    fs.createReadStream(inputFile),
    ndjson.parse(),
    reduceRange,
    reduceNonRange,
    ndjson.stringify(),
    fs.createWriteStream(outputFile),
    err => {
      if (err) {
        exitWithError(err)
      } else {
        exitWhenOutputsFlushed()
      }
    }
  )
}

// first pass: read the whole input through the index stream, then start the second pass
console.log('First pass to index non-ranges by street,suburb,state,postcode properties')
pipeline(
  fs.createReadStream(inputFile),
  ndjson.parse(),
  index,
  err => {
    if (err) {
      exitWithError(err)
    } else {
      runSecondPass()
    }
  }
)