From 870b74488439f6c30974eb759d5032fc06cb8c14 Mon Sep 17 00:00:00 2001 From: Fredrik Unger Date: Fri, 20 Feb 2015 15:51:58 +0100 Subject: [PATCH] path: adding support for . in directories This was done as directories with : were not allowed in Windows, and switching to . caused problems with the links relying on the index.langcode.xml format. --- treecutter/directory.py | 6 ++++-- treecutter/link.py | 9 +++++---- treecutter/sitemap.py | 3 ++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/treecutter/directory.py b/treecutter/directory.py index 7519a6b..e7508d9 100644 --- a/treecutter/directory.py +++ b/treecutter/directory.py @@ -3,12 +3,14 @@ import os import fnmatch from lxml import etree import treecutter.constants as const +import re class Directory(): """Class containing the state of the directory with articles""" def __init__(self): self._cwd = u'.' self._tree = [] + self._basepath = re.compile('[/\w\._-]*/\w+',re.UNICODE) def scan(self): for dirname, dirnames, filenames in os.walk(self._cwd): @@ -19,8 +21,8 @@ class Directory(): title = doc.xpath(u'/db:article/db:info/db:title',namespaces=const.XPATH) menu = doc.xpath(u'/db:article/db:info/db:titleabbrev',namespaces=const.XPATH) if title and menu: - base = file_.split('.')[1] - link = base.replace('index','') + base = self._basepath.match(file_).group() + link = base.replace('index','')[1:] self._tree.append(link) def set(self): diff --git a/treecutter/link.py b/treecutter/link.py index 69ee9b1..f482c0b 100644 --- a/treecutter/link.py +++ b/treecutter/link.py @@ -1,4 +1,5 @@ #!/usr/bin/python +import re import glob from treecutter.page import Page @@ -8,6 +9,7 @@ class Link(): self._link = link # find the representations of the link. 
self._pages = [] + self._langregexp = re.compile('.*\.(\w\w)\.xml') path = link if self._link[-1] == '/': path = path+'index' @@ -20,10 +22,9 @@ class Link(): def _scan_languages(self,path): lang = [] - for l in glob.glob('.'+path+'*'): - ls = l.split('.') - if len(ls) > 3 and ls[3] == 'xml': - lang.append((ls[2],l)) + for l in glob.glob('.'+path+'*.xml'): + langcode = self._langregexp.search(l).group(1) + lang.append((langcode,l)) return lang def link(self): diff --git a/treecutter/sitemap.py b/treecutter/sitemap.py index 7ad9446..919e8a4 100644 --- a/treecutter/sitemap.py +++ b/treecutter/sitemap.py @@ -3,6 +3,7 @@ import os import codecs import re import shutil +import sys import gettext import tempfile from lxml import etree @@ -31,7 +32,7 @@ class Sitemap(): # representing the text on the site. # A link can have several pages in different languages. def add_link(self, link): - tokens = filter(None,re.split(r'(^/[\w:-]*$|^/[\w:-]*/|[\w:-]*/)',link,flags=re.UNICODE)) + tokens = filter(None,re.split(r'(^/[\w\.:-]*$|^/[\w\.:-]*/|[\w\.:-]*/)',link,flags=re.UNICODE)) self._tree.add(tokens,Link(link)) def write_map(self): -- 2.30.2