aboutsummaryrefslogtreecommitdiff
path: root/export.py
diff options
context:
space:
mode:
Diffstat (limited to 'export.py')
-rw-r--r--export.py419
1 files changed, 0 insertions, 419 deletions
diff --git a/export.py b/export.py
deleted file mode 100644
index bf45e5e..0000000
--- a/export.py
+++ /dev/null
@@ -1,419 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import datetime
-import itertools
-import json
-import os
-import re
-import sys
-import textwrap
-
-import pandas as pd
-
-import license
-
# Package fields which refer to download locations.
# Consumed by extract_resources() via re.findall('\S+', ...), so only the
# whitespace-separated tokens matter here, not the layout of the string.
download_keys = """
    Origin
    Original-Source
    Source
    Source-Code
    X-Origin
    X-Original-Package
    X-Source
    """
-
def concat(xss):
    """Flatten one level of nesting: return a single list of all elements.

    The previous implementation shadowed the builtin ``all`` with its
    accumulator; use itertools (already imported by this module) instead.
    """
    return list(itertools.chain.from_iterable(xss))
-
def indent(s):
    """Indent each non-empty line of *s* by two spaces.

    Every line in the result is newline-terminated; empty lines stay empty.
    """
    pieces = []
    for line in s.splitlines():
        pieces.append('  %s\n' % line if line else '\n')
    return ''.join(pieces)
-
def today():
    """Return the current local date as an ISO ``YYYY-MM-DD`` string."""
    return datetime.date.today().isoformat()
-
def warn(*x):
    """Print a warning to stderr: 'warning:' followed by the arguments.

    Python 3 port of the old ``print >>sys.stderr, s,`` statement loop;
    prints the same space-separated fields, terminated by a newline.
    """
    print('warning:', *x, file=sys.stderr)
-
class ExportFailure(Exception):
    """Raised when a package cannot be exported to wiki templates."""
    pass
-
class PkgData(object):
    """Load the package/copyright/changelog DataFrames from HDF5 stores.

    Attributes (all pandas objects loaded eagerly in __init__):
      pkgs, descs, srcs   -- binary packages, descriptions, source packages
      cpf, cps, licenses  -- copyright files, summaries, license stanzas
      cl                  -- changelog versions
    """

    def __init__(self):
        # Context managers guarantee the stores are closed even when a
        # key lookup raises (the old explicit close() leaked on error).
        with pd.HDFStore('pkg.h5') as pkg_store:
            self.pkgs = pkg_store['packages']
            self.descs = pkg_store['descriptions']
            self.srcs = pkg_store['sources']

        with pd.HDFStore('cp.h5') as cp_store:
            self.cpf = cp_store['cp_files']
            self.cps = cp_store['cp_summary']
            self.licenses = cp_store['licenses']

        with pd.HDFStore('cl.h5') as cl_store:
            self.cl = cl_store['cl_versions']
-
def nouni(s):
    """Normalise *s* to a native ``str`` for ``%``-formatting.

    Python 3 port: the old code referenced the removed ``unicode`` type
    and encoded to UTF-8 byte strings for Python 2 output.  Under
    Python 3 the natural normalisation is the other direction: decode
    UTF-8 ``bytes``, pass ``str`` (and anything else) through unchanged.
    """
    return s.decode('utf8') if isinstance(s, bytes) else s
-
class Template(object):
    """A wiki template, rendered by __str__ as ``{{name\n|k=v\n...\n}}``."""

    def __init__(self, name, values):
        # name: template name; values: ordered sequence of
        # (field, value) pairs -- order is preserved when rendering.
        self.name = name
        self.values = values

    def __str__(self):
        # One '|field=value' line per pair, in the order given.
        # nouni() normalises each piece before %-formatting.
        return '{{%s\n%s\n}}' % (
            nouni(self.name),
            '\n'.join(['|' + '%s=%s' %
                (nouni(n), nouni(v))
                for (n, v) in self.values]))
-
def parse_tags(s):
    """Split a comma-separated debtags field into a list of tags."""
    joined = s.replace('\n', '')
    return joined.split(', ')
-
def extract_languages(tags):
    """Extract implementation-language names from debtags tags.

    Recognises ``implemented-in::<lang>`` and ``devel::lang:<lang>``;
    returns the de-duplicated language names as a list.
    """
    found = set()

    for tag in tags:
        (facet, value) = tag.split('::')

        if facet == 'implemented-in':
            found.add(value)
        elif facet == 'devel' and value.startswith('lang:'):
            found.add(value.split(':')[1])

    return list(found)
-
def catechise(s):
    """Wrap trademark/branding-sensitive phrases in ``??...??`` markers.

    Bug fix: the flags were previously passed as ``re.sub``'s positional
    *count* argument, so DOTALL/IGNORECASE never applied (and the
    replacement count was a meaningless flag bitmask).
    """
    heresies = ["open source", "debian", "(?<!gnu/)linux", "creative commons"]
    # ' ' -> '.' lets a phrase match across a line break (with DOTALL);
    # the replacement collapses that newline back to a space.
    pattern = '\\b(%s)\\b' % '|'.join([h.replace(' ', '.') for h in heresies])
    return re.sub(pattern,
                  lambda m: '??%s??' % m.group(1).replace('\n', ' '),
                  s,
                  flags=re.DOTALL | re.IGNORECASE)
-
def munge_description(s):
    """Reflow a Debian control-style description into plain paragraphs.

    Input paragraphs are separated by a line containing only ' .'; each
    is unwrapped and re-wrapped at 65 columns, joined by blank lines.
    """
    wrapped = []
    for para in s.split('\n .\n'):
        flat = para.lstrip().replace('\n', '')
        wrapped.append(textwrap.fill(flat, 65))
    return '\n\n'.join(wrapped)
-
def get_license_map():
    """Parse the ``license_map`` file into {alias: canonical-name}.

    The file is a sequence of blank-line-separated stanzas of the form:

        [Canonical-Name]
        alias one
        alias two

    Fixes: the old code used the Python 2 ``file()`` builtin, never
    closed the handle, and shadowed the builtin ``map``.
    """
    license_map = {}

    with open('license_map') as f:
        content = f.read()

    for para in content.split('\n\n'):
        if not para:
            continue

        match = re.match(r'\[([^\]]+)\]', para)
        assert match, para
        canonical = match.group(1)
        aliases = para[match.end():].lstrip().splitlines()

        for alias in aliases:
            license_map[alias] = canonical

    return license_map
-
def srcpkg_extract_licenses(header, filess, licenses, cl_date, cl_uploader):
    """Yield one 'Project license' Template per Files stanza.

    header   -- copyright header stanza (unused so far; see XXX below)
    filess   -- DataFrame of Files stanzas
    licenses -- DataFrame of standalone License stanzas
    cl_date, cl_uploader -- changelog date/uploader recorded as verifier

    Bug fix: the final fallback branch only assigned ``txt`` when some
    license names had no standalone stanza, raising UnboundLocalError
    otherwise; it now always falls back to the bare license expression.
    (The py2-era ``.encode('utf8')`` calls are also dropped; nouni()
    handles normalisation at render time.)
    """
    # XXX: generate template from header stanza
    # XXX: flag CC licenses
    # XXX: check all License stanzas were included
    # XXX: exclude licenses for Files: debian/*
    lmap = get_license_map()
    # Map license short name -> full text from standalone License
    # stanzas (dropping the first line, which repeats the short name).
    by_name = dict([
        (s['_license'],
         s['License'].split('\n', 1)[1]
         if '\n' in s['License']
         else s['License'])
        for (_idx, s) in licenses.iterrows()])

    for (_ix, files) in filess.iterrows():
        ldesc = files['_license'].strip().lower()
        ltext = files['License']

        if '\n' in ltext:
            # Looks like license text is included directly.
            ltext = munge_description(ltext)
            txt = 'License: %s\n\n%s' % (ldesc, ltext)
        elif ldesc in by_name:
            # License information is a stub. Try to find the corresponding
            # text(s).
            ltext = munge_description(by_name[ldesc])
            txt = 'License: %s\n\n%s' % (ldesc, ltext)
        else:
            # Compound license expression (e.g. "GPL-2+ or MIT"):
            # fall back to the bare parsed expression.  Previously this
            # assignment was guarded by "if missing:", crashing when all
            # names were known.
            parsed = license.parse_licenses(ldesc)
            txt = 'License: %s' % (parsed,)

        # ldesc is already lower-cased above, so no .lower() needed here.
        canon = lmap.get(ldesc, 'Other')
        # XXX: Should maybe bail if there's no copyright field.
        cp = ''.join(
            'Copyright %s' % line.lstrip()
            for line in files.dropna().get('Copyright', '').splitlines())

        yield Template('Project license', [
            ('License', canon),
            ('License copyright', cp),
            ('License verified by', 'Debian: %s' % cl_uploader),
            ('License verified date', cl_date),
            ('License note', txt)])
-
def parse_person(s):
    """Split 'Name <email>' into (name, email); email is '' if absent."""
    match = re.match(r'([^<]+)\s+<([^>]+)>', s)

    if not match:
        return (s, '')

    return (match.group(1), match.group(2))
-
def extract_people(df):
    """Yield Person templates found in a copyright-header stanza."""
    # XXX: extract contributors, maintainers
    stanza = df.dropna()

    if 'Upstream-Contact' not in stanza:
        return

    (name, email) = parse_person(stanza['Upstream-Contact'])
    yield Template('Person', [
        ('Real name', name),
        ('Role', 'contact'),
        ('Email', email)])
-
def extract_resources(cp_header):
    """Yield Resource (download) templates for known source-location fields.

    The recognised field names are the tokens of the module-level
    ``download_keys`` string.
    """
    fields = cp_header.dropna()

    for key in re.findall(r'\S+', download_keys):
        if key not in fields:
            continue
        yield Template('Resource', [
            ('Resource kind', 'Download'),
            ('Resource URL', fields[key])])
-
def export_srcpkgs(data, name, srcpkg_names):
    """Export a package by reference to its constituent source packages.

    This coordinates all the information that goes into a particular page.
    Yields Template objects: Entry, Import, then license/people/resource
    templates gathered from each source package.

    Python 3 / pandas fixes: ``dict.iteritems`` -> ``items``; the removed
    ``DataFrame.ix`` indexer and positional ``Series[0]`` lookups ->
    ``iloc`` (the old positional-label indexing was the likely cause of
    the "rare bad index error" worked around below); the bare ``except:``
    around the description lookup now catches only the expected failures.
    """

    # Map source package names to binary packages, and also make note
    # of which versions of those source packages we're looking at.
    binpkgs = pd.concat([
        data.pkgs[data.pkgs['_srcpkg'] == srcpkg]
        for srcpkg in srcpkg_names])
    versions = {}

    # Build the space-separated list of source-archive URLs; the Debian
    # pool is keyed by the first letter (or the 'libX' prefix) of the
    # file name.
    srcfiles = ""
    for srcpkg in srcpkg_names:
        srcfile = data.srcs[data.srcs['_srcpkg'] == srcpkg]['srcfile'].values[0]
        letter = srcfile[0]
        if srcfile[:3] == 'lib':
            letter = srcfile[:4]
        srcfile = 'http://ftp.debian.org/debian/pool/main/%s/%s/%s' % (letter, srcpkg, srcfile)
        srcfiles = srcfiles + srcfile + " "

    for (_i, pkg) in binpkgs.iterrows():
        versions[pkg['_srcpkg']] = pkg['Version']

    if len(binpkgs) == 0:
        warn('no binary packages found for', srcpkg_names)
        return

    binpkg_names = sorted(binpkgs['Package'], key=len)
    homepages = list(binpkgs['Homepage'])
    # XXX: maybe choose the one that appears the most?
    homepage = homepages[0] if homepages else ''
    tags = set(concat(
        [parse_tags(t) for t in binpkgs['Tag'] if not pd.isnull(t)]))
    langs = [s.title() for s in extract_languages(tags)]

    if name in binpkg_names:
        descpkg = name
    else:
        # Heuristic: choose the package with the shortest name.
        # We could try to do something smarter, like look for the common
        # prefix of the descriptions of all the binary packages.
        descpkg = binpkg_names[0]
    try:
        desc = list(data.descs[
            data.descs['Package'] == descpkg]['Description-en'])[0]
        (short_desc, full_desc) = desc.split('\n', 1)
    except (IndexError, ValueError):
        # No description found (IndexError) or it has no body below the
        # first line (ValueError from the 2-way unpack).
        full_desc = ''
        short_desc = ''
    full_desc = catechise(munge_description(full_desc))

    yield Template('Entry', [
        ('Name', name.capitalize()),
        ('Short description', short_desc),
        ('Full description', full_desc),
        ('Homepage URL', homepage),
        ('User level', ''),
        # XXX get this information from apt-file
        ('Component programs', ''),
        ('VCS checkout command', ''),
        ('Computer languages', ', '.join(langs)),
        ('Status', ''),
        ('Is GNU', 'No'),
        # NOTE: 'pkg' is the last row bound by the versions loop above;
        # guarded by the len(binpkgs) == 0 early return.
        ('Version identifier', pkg['Version']),
        ('Version download', srcfiles),
        ('Submitted by', 'Debian import'),
        ('Submitted date', today())])

    yield Template('Import', [
        ('Source', 'Debian'),
        ('Source link',
         'http://packages.debian.org/sid/' + srcpkg_names[0]),
        ('Source packages',
         ', '.join('%s %s' % (k, v) for (k, v) in versions.items())),
        ('Date', today())])

    people = []
    res = []

    for srcpkg in srcpkg_names:
        pkg_cps = data.cps[data.cps['_srcpkg'] == srcpkg].iloc[0]
        pkg_cpf = data.cpf[data.cpf['_srcpkg'] == srcpkg]
        pkg_licenses = data.licenses[data.licenses['_srcpkg'] == srcpkg]
        people.extend(list(extract_people(pkg_cps)))
        res.extend(list(extract_resources(pkg_cps)))

        pkg_cl = data.cl[data.cl['_srcpkg'] == srcpkg]
        cl_date = pkg_cl['date']
        # avoid rare bad index error:
        if cl_date.empty:
            continue
        cl_date = cl_date.iloc[0]
        cl_uploader = pkg_cl['author'].iloc[0]
        for template in srcpkg_extract_licenses(
                pkg_cps, pkg_cpf, pkg_licenses, cl_date, cl_uploader):
            # XXX: eliminate duplicates
            yield template

    for template in people:
        # XXX: eliminate duplicates
        yield template

    for template in res:
        # XXX: eliminate duplicates
        yield template

    #yield Template('Software category', [
    #    ('Resource kind', ''),
    #    ('Resource URL', '')])
-
def filename(s):
    """Map a page name to a safe '.wiki' file name (odd chars -> '_')."""
    safe = re.sub('[^A-Za-z0-9_+.-]', '_', s)
    assert safe, s
    return safe + '.wiki'
-
def output(path, xs):
    """Write str(x) for each x in *xs* to *path*, one per line."""
    with open(path, 'w') as f:
        f.writelines(str(x) + '\n' for x in xs)
-
def output_multi(path, xs):
    """Output a bunch of pages to a directory with an index.

    xs -- iterable of (page name, templates) pairs.  Each page is
    written via output(); an 'index.json' mapping file name to
    {'page': ..., 'file': ...} is written alongside.

    Fixes: the Python 2 ``file()`` builtin is gone, and the JSON index
    handle was never closed -- use ``open()`` in a ``with`` block.
    """
    index = {}

    if not os.path.exists(path):
        os.makedirs(path)

    for (name, templates) in xs:
        fname = filename(name)
        fpath = os.path.join(path, fname)
        index[fname] = {'page': name, 'file': fname}
        output(fpath, templates)

    fpath = os.path.join(path, 'index.json')
    with open(fpath, 'w') as f:
        json.dump(index, f)
-
def uname_srcpkgs(data, name):
    """Return the source-package names whose Upstream-Name equals *name*."""
    matching = data.cps[data.cps['Upstream-Name'] == name]
    return list(matching['_srcpkg'])
-
def export_all(data):
    """Export all packages.

    Returns a generator of (name, templates) tuples.
    """

    # First, find all upstream names and the source packages
    # corresponding to them.
    unames = sorted(set(data.cps['Upstream-Name'].dropna()))

    # For source packages with no upstream name, use the source package
    # name as the upstream name.
    no_uname = sorted(set(
        data.cps[data.cps['Upstream-Name'].isnull()]['_srcpkg']))

    named = ((uname, uname_srcpkgs(data, uname)) for uname in unames)
    unnamed = ((srcpkg, [srcpkg]) for srcpkg in no_uname)

    for (name, srcpkgs) in itertools.chain(named, unnamed):
        if not name:
            continue

        if '\n' in name:
            # Seriously?
            warn('bad name: %r' % name)
            continue

        # Generator; exceptions are delayed.
        yield (name, export_srcpkgs(data, name, srcpkgs))
-
def export_all_to_directory(data, outputdir):
    """Export every package as a wiki page under *outputdir*.

    Python 3 port: the ``except E, e`` syntax, the ``unicode`` type and
    ``e.message`` are all gone.  Behaviour fix: a package whose export
    fails is now warned about and *skipped* -- previously the
    half-consumed generator was still yielded, so output_multi() hit the
    same exception again, uncaught.
    """
    def _export():
        for (name, templates) in export_all(data):
            # Progress report, one page name per line.
            print(name)

            try:
                # Force errors.
                templates = list(templates)
            except ExportFailure as e:
                warn('export failed: %s: %s' % (name.strip(), str(e).strip()))
                continue

            yield (name, templates)

    output_multi(outputdir, _export())
-
def main():
    """CLI entry point.

    No arguments: export everything into the 'output' directory.
    One argument: treat it as an upstream name and print its templates.
    Anything else: error out.

    Fix: ``print template`` was Python 2 print-statement syntax.
    """
    data = PkgData()
    args = sys.argv[1:]

    if len(args) == 0:
        export_all_to_directory(data, 'output')
    elif len(args) == 1:
        # XXX: assumes argument is an upstream name
        uname = args[0]
        srcpkgs = uname_srcpkgs(data, uname)
        templates = export_srcpkgs(data, uname, srcpkgs)

        for template in templates:
            print(template)
    else:
        raise RuntimeError()
-
# Script entry point: run the exporter only when invoked directly.
if __name__ == '__main__':
    main()