import fnmatch
from lxml import etree
import treecutter.constants as const
+import re
class Directory():
"""Class containing the state of the directory with articles"""
def __init__(self):
self._cwd = u'.'
self._tree = []
+ self._basepath = re.compile('[/\w\._-]*/\w+',re.UNICODE)
def scan(self):
for dirname, dirnames, filenames in os.walk(self._cwd):
title = doc.xpath(u'/db:article/db:info/db:title',namespaces=const.XPATH)
menu = doc.xpath(u'/db:article/db:info/db:titleabbrev',namespaces=const.XPATH)
if title and menu:
- base = file_.split('.')[1]
- link = base.replace('index','')
+ base = self._basepath.match(file_).group()
+ link = base.replace('index','')[1:]
self._tree.append(link)
def set(self):
#!/usr/bin/python
+import re
import glob
from treecutter.page import Page
self._link = link
# find the representations of the link.
self._pages = []
+ self._langregexp = re.compile('.*\.(\w\w)\.xml')
path = link
if self._link[-1] == '/':
path = path+'index'
def _scan_languages(self, path):
    """Return (language code, file name) pairs for the XML files of a link.

    Globs for '.' + path + '*.xml' and extracts the language code from
    each file name with self._langregexp.  Files whose name carries no
    language code (the pattern does not match) are skipped instead of
    raising AttributeError on the missing match object.
    """
    lang = []
    for filename in glob.glob('.' + path + '*.xml'):
        match = self._langregexp.search(filename)
        if match is None:
            # e.g. 'index.xml', which has no '.xx' language part
            continue
        lang.append((match.group(1), filename))
    return lang
def link(self):
import codecs
import re
import shutil
+import sys
import gettext
import tempfile
from lxml import etree
# representing the text on the site.
# A link can have several pages in different languages.
def add_link(self, link):
    """Split link into path tokens and register it in the tree."""
    # One token per path segment: a lone '/name', a leading '/name/',
    # or any following 'name/' piece; dots are allowed inside names.
    segment_pattern = r'(^/[\w\.:-]*$|^/[\w\.:-]*/|[\w\.:-]*/)'
    pieces = re.split(segment_pattern, link, flags=re.UNICODE)
    # re.split yields empty strings around the captured separators; drop them.
    tokens = filter(None, pieces)
    self._tree.add(tokens, Link(link))
def write_map(self):