From 3dc769bd4237b439c32b1dc4ad128e7cda480a15 Mon Sep 17 00:00:00 2001 From: Ruben Rodriguez Date: Wed, 1 Jul 2015 14:51:05 -0500 Subject: Parse the changelog files and extract the uploader name and date. The load_changelog.py script parses the changelog files in the same way as load_copyright.py and stores the results (package name, version, author/uploader and date) in cl.h5. That information is later used by export.py to fill in the 'License verified by' and 'License verified date' fields with more accurate information than 'Debian' and today(). --- export.py | 15 +++++++++++---- load_changelog.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 load_changelog.py diff --git a/export.py b/export.py index 5f358cb..8bd72bd 100644 --- a/export.py +++ b/export.py @@ -60,6 +60,10 @@ class PkgData(object): self.licenses = cp_store['licenses'] cp_store.close() + cl_store = pd.HDFStore('cl.h5') + self.cl = cl_store['cl_versions'] + cl_store.close() + def nouni(s): return s.encode('utf8') if isinstance(s, unicode) else s @@ -122,7 +126,7 @@ def get_license_map(): return map -def srcpkg_extract_licenses(header, filess, licenses): +def srcpkg_extract_licenses(header, filess, licenses, cl_date, cl_uploader): # XXX: generate template from header stanza # XXX: flag CC licenses # XXX: check all License stanzas were included @@ -174,8 +178,8 @@ def srcpkg_extract_licenses(header, filess, licenses): yield Template('Project license', [ ('License', canon), ('License copyright', cp), - ('License verified by', 'Debian'), - ('License verified date', today()), + ('License verified by', 'Debian: %s' % cl_uploader), + ('License verified date', cl_date), ('License note', txt)]) def parse_person(s): @@ -284,8 +288,11 @@ def export_srcpkgs(data, name, srcpkg_names): people.extend(list(extract_people(pkg_cps))) res.extend(list(extract_resources(pkg_cps))) + pkg_cl = data.cl[data.cl['_srcpkg'] == srcpkg] + cl_date = 
pkg_cl['date'][0] + cl_uploader = pkg_cl['author'][0] for template in srcpkg_extract_licenses( - pkg_cps, pkg_cpf, pkg_licenses): + pkg_cps, pkg_cpf, pkg_licenses, cl_date, cl_uploader): # XXX: eliminate duplicates yield template diff --git a/load_changelog.py b/load_changelog.py new file mode 100644 index 0000000..448c087 --- /dev/null +++ b/load_changelog.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from debian import changelog +import pandas as pd +import os +import sys +from dateutil import parser + +def import_one(pkgname, fh): + try: + c = changelog.Changelog(fh) + date = parser.parse(c.date).strftime('%Y-%m-%d') + df = pd.DataFrame([{'_srcpkg':c.package, 'version':c.version, 'date':date, 'author':c.author}]) + except: + return + return (df) + +def get_pkgname(path): + (dir, base) = os.path.split(path) + + if base in ('current', 'changelog.txt'): + return get_pkgname(dir) + else: + return base + +def main(paths): + versions = [] + + for path in paths: + pkgname = get_pkgname(path) + print pkgname, path + data = import_one(pkgname, file(path)) + + if data is not None: + versions.append(data) + + versions = pd.concat(versions) + print versions + store = pd.HDFStore('cl.h5') + store['cl_versions'] = versions + store.close() + +if __name__ == '__main__': + main(sys.argv[1:]) + -- cgit v1.2.3