Removed the initial version of tree-cutter.py after refactoring to use objects.
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 import gettext
15 import shutil
16 from amara import bindery
17 from amara.xslt import transform
18 from Cheetah.Template import Template
19
# Default locations are siblings of the current working directory.
_site_root = os.path.dirname(os.getcwd())
_default_style = _site_root+'/style/default/'
_default_output = _site_root+'/htdocs/'

parser = argparse.ArgumentParser(description='Process docbook article tree.')
# With nargs='?' a bare "--style" / "--output" would store None unless a
# const is given; mirror the default so later string concatenation works.
parser.add_argument('--style', nargs='?',
                    default=_default_style, const=_default_style)
parser.add_argument('--output', nargs='?',
                    default=_default_output, const=_default_output)
args = parser.parse_args()

# File names are appended directly to args.style throughout the script, and
# Page.render() invokes xsltproc after changing into the article directory.
# Make the path absolute and guarantee the trailing separator so both keep
# working for values such as "--style ../style/custom".
args.style = os.path.join(os.path.abspath(args.style), '')

style_xslt = args.style+"docbook.xsl"
outputdir = args.output

# Scratch directory the site is assembled in before it is published.
tmptarget = tempfile.mkdtemp()+'/'

# Extensions of xi:include targets that are executed instead of included.
valid_scripts = ['.py','.pl']
MAXLEVEL = 10000
34
def mkdir_p(path):
    """Create directory *path* including missing parents, like ``mkdir -p``.

    An already existing directory is not an error.  Every other failure is
    re-raised as OSError, including the case where *path* exists but is not
    a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # EEXIST is reported for any existing entry, also for a regular file;
        # only ignore it when a directory really is in place.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
42
def publish(src,target):
    """Mirror *src* into *target* using ``rsync -a --delete``.

    A non-zero rsync exit status is reported on stdout; nothing is raised
    or returned for it.
    """
    command = ["rsync", "-a", "--delete", src, target]
    status = subprocess.call(command)
    if status != 0:
        print('Error: ' + ' '.join(command) + ' Returncode [' + str(status) + ']')
48
49
# Namespace prefixes handed to the XML parser so the XPath expressions in
# this file can use the short db:/xi:/xl: forms.
PREFIXES = dict([
    (u'db', u'http://docbook.org/ns/docbook'),
    (u'xi', u'http://www.w3.org/2001/XInclude'),
    (u'xl', u'http://www.w3.org/1999/xlink'),
])
53
class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def _link_for(self, file_):
        """Return the site link for article file *file_*.

        The link is the file's path relative to the scanned directory, with
        a leading '/', cut at the first dot of the file name; a file whose
        name stem is exactly 'index' maps to its directory ('/dir/').
        Only the file name is inspected, so dots or the text 'index' inside
        directory names do not alter the result.
        """
        rel = os.path.relpath(file_, self._cwd).replace(os.sep, '/')
        folder, _, name = rel.rpartition('/')
        stem = name.split('.', 1)[0]
        if stem == 'index':
            stem = ''
        prefix = '/'
        if folder:
            prefix = '/'+folder+'/'
        return prefix+stem

    def scan(self):
        """Walk the directory and record a link for every article found.

        A file counts as an article when it matches '*.xml' and has both a
        title and a titleabbrev in its article info.  The same link may be
        recorded several times (several matching files can map to one link);
        set() collapses the duplicates.
        """
        for dirname, dirnames, filenames in os.walk(self._cwd):
            for filename in fnmatch.filter(filenames, '*.xml'):
                file_ = os.path.join(dirname,filename)
                doc = bindery.parse(file_, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if title and menu:
                    self._tree.append(self._link_for(file_))

    def set(self):
        """Return the distinct links found by scan() as a set"""
        return set(self._tree)
75
class Page():
    """Class representing a version of a webpage"""
    def __init__(self,link,page):
        """*link* is the owning Link, *page* a (language, filename) pair"""
        self._link = link
        self._file = page[1]
        self._lang = page[0]
        self._doc = None
        self._resources = []
        self._title = None
        self._menu = None
        self._rendered_article = None

    def language(self):
        """Return the language code of this page"""
        return self._lang

    def resources(self):
        """Return the set of local files collected by prepare()"""
        return set(self._resources)

    def menu(self):
        """Return the menu text (titleabbrev), or None before prepare()"""
        return self._menu

    def set_article(self,art):
        """Use *art* as the rendered article instead of calling render()"""
        self._rendered_article = art

    def prepare(self):
        """Parse the article, expand script includes and collect resources.

        Reads title and titleabbrev, replaces every text xi:include whose
        target has an extension listed in valid_scripts by the XML that
        script prints on stdout, and records linked files and images that
        exist on disk.
        """
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        for c in self._doc.xml_select(u"//xi:include[@parse='text']"):
            (p, ext) = os.path.splitext(c.href)
            if ext not in valid_scripts:
                continue
            exe = os.path.join(os.path.abspath(dirname), c.href)
            proc = subprocess.Popen([exe],stdout=subprocess.PIPE)
            # communicate() also waits for the child, so it is reaped.
            output = proc.communicate()[0]
            xstr = bindery.parse(str(output))
            parent = c.xml_parent
            idp = c.xml_index_on_parent
            # Insert at increasing positions to keep the generated nodes in
            # document order, then drop the include element exactly once.
            for offset, x in enumerate(list(xstr.xml_children)):
                parent.xml_insert(idp+offset,x)
            parent.xml_remove(c)

        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        """Run the prepared document through xsltproc and keep the output.

        The document is written to a temporary file inside the article's
        directory and xsltproc is started from that directory.  A failing
        xsltproc is reported on stdout.  The previous working directory is
        restored and the temporary files are removed even when a step fails.
        """
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        infile = None
        outfile = None
        os.chdir(dirname)
        try:
            # mkstemp creates the files atomically, unlike mktemp which only
            # proposes a name that may be taken before it is opened.
            fd, infile = tempfile.mkstemp(dir='.')
            with os.fdopen(fd,'w') as tfi:
                tfi.write(self._doc.xml_encode())
            fd, outfile = tempfile.mkstemp()
            os.close(fd)
            #  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
            cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
            retcode = subprocess.call(cmd)
            if retcode:
                print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
            with open(outfile,'r') as tfo:
                self._rendered_article = tfo.read()
        finally:
            for tmp in (infile, outfile):
                if tmp is not None and os.path.exists(tmp):
                    os.remove(tmp)
            os.chdir(cwd)

    def template(self,sitemap):
        """Fill the language's HTML template and write the page to tmptarget.

        The output name is the article file name with its last 'xml'
        replaced by 'html'.
        """
        htmlmenu =  sitemap.gen_menu(self._lang,None,"menu")
        levelmenu = sitemap.gen_menu(self._lang,self,"tree")
        langmenu = sitemap.lang_menu(self._lang,self._link)
        template = Template(file=args.style+'index.'+self._lang+'.html.tmpl',
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'langmenu':langmenu}])
        outfile = tmptarget+'html'.join(self._file.rsplit('xml',1))
        mkdir_p(os.path.dirname(outfile))
        with open(outfile, 'w') as out:
            out.write(str(template))
168
169
class Link():
    """Class representing a webpage on the site"""
    def __init__(self,link):
        """Create the link and one Page per language file found on disk.

        A link ending in '/' is backed by files named 'index' in that
        directory.
        """
        self._link = link
        # find the representations of the link.
        self._pages = []
        path = link
        if link.endswith('/'):
            path = path+'index'
        for l in self._scan_languages(path):
            self._pages.append(Page(self,l))

    def add_page(self,l):
        """Add a page given as a (language, filename) pair"""
        self._pages.append(Page(self,l))

    def _scan_languages(self,path):
        """Return sorted (language, filename) pairs for the files of *path*.

        Only files named exactly '<path>.<language>.xml' below the current
        directory are accepted.  The language is taken from the file name
        alone, so dots in directory names do not disturb the parsing, and
        pages that merely share a name prefix are not picked up.
        """
        lang = []
        for l in sorted(glob.glob('.'+path+'.*.xml')):
            parts = os.path.basename(l).split('.')
            # '*' may span several dotted parts; require stem.lang.xml
            if len(parts) == 3:
                lang.append((parts[1],l))
        return lang

    def link(self):
        """Return the link string"""
        return self._link

    def prepare(self):
        """Prepare every language version of the page"""
        for page in self._pages:
            page.prepare()

    def languages(self):
        """Return the language codes of all pages, in page order"""
        return [page.language() for page in self._pages]

    def render(self):
        """Render every language version of the page"""
        for page in self._pages:
            page.render()

    def template(self,sitemap):
        """Write the HTML of every language version of the page"""
        for page in self._pages:
            page.template(sitemap)

    def page(self,lang):
        """Return the page in language *lang*, or None if there is none"""
        for page in self._pages:
            if page.language()==lang:
                return page
        return None

    def resources(self):
        """Return the union of the resources of all pages"""
        res  = set()
        for page in self._pages:
            res = res.union(page.resources())
        return res
226
227
class Node:
    """One trie entry: a key token, the stored value and the child nodes"""

    def __init__(self, token, value):
        # Children start out empty; callers append to the list returned
        # by children().
        self._children = []
        self._value = value
        self._token = token

    def token(self):
        """Return the key token this node was created with"""
        return self._token

    def value(self):
        """Return the value stored in this node"""
        return self._value

    def children(self):
        """Return the (mutable) list of child nodes"""
        return self._children
242
class Trie():
    """Ordered tree of Node objects addressed by lists of tokens"""
    def __init__(self):
        self._root = []

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self,t):
        """Yield the values below *t*, each node before its children"""
        for l in t:
            yield l.value()
            for x in self.inorder(l.children()):
                yield x

    def _add(self,trie, key, content):
        """Insert *content* below *trie*; return True when it was stored.

        *key* is not modified.  The last token names the new node; every
        token before it must match an existing node, otherwise nothing is
        inserted and False is returned.
        """
        if not key:
            return False
        k, rest = key[0], key[1:]
        # is the key a leaf
        if not rest:
            trie.append(Node(k,content))
            return True
        for ch in trie:
            if ch.token() == k:
                return self._add(ch.children(), rest, content)
        return False

    def add(self,key, content):
        """Store *content* under the token sequence *key*.

        Returns True when the content was inserted and False when the key is
        empty or its parent tokens are not present yet.
        """
        return self._add(self._root, list(key), content)

    def _graph(self, trie, G):
        """Add the nodes and parent-child edges below *trie* to graph *G*"""
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            # Descend once per node, not once per child edge.
            self._graph(l.children(), G)

    def graph(self):
        """Build and return a directed graph of the trie rooted at 'sitemap'"""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
#        print G.string()
        return G

    def _menu(self, trie, lang, page, css):
        """Return the nested <ul> menu for the nodes in *trie*.

        Entries without a page in *lang* link to the English version and are
        marked with '*'.  The entry whose page equals *page* gets the
        'selected' class.
        """
        html = "<ul%s>\n" % css
        for l in trie:
            link = l.value()
            p = link.page(lang)
            sel = ''
            # A missing translation must not count as the current page when
            # no page is selected (page is None).
            if p is not None and p == page:
                sel = ' class="selected"'
            if p is not None:
                html += '<li%s><a href="%s">%s</a>\n' \
                    % (sel,link.link(),p.menu())
            else:
                # Directory links are backed by their 'index' file, so the
                # language suffix has to go on '<dir>/index'.
                href = link.link()
                if href.endswith('/'):
                    href = href+'index'
                html += '<li%s><a href="%s.en" hreflang="en">%s</a>*\n' \
                    % (sel,href,link.page('en').menu())
            if l.children():
                html += self._menu(l.children(), lang, page, "")
        html += "</ul>\n"
        return html

    def menu(self,lang,page,cssclass):
        """Return the HTML menu in *lang*; *cssclass* styles the outer list"""
        css = ''
        if cssclass:
            css = ' class="'+cssclass+'"'
        return self._menu(self._root, lang, page, css)
310
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()
        self._sitelang = set()
        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
        self._tranlang = {}

    def _tokenize(self, link):
        """Split *link* into trie tokens, e.g. '/foo/bar' -> ['/foo/', 'bar']"""
        return [t for t in re.split(r'(^/[\w-]*/|[\w-]*/)',link) if t]

    def add_link(self, link):
        """Create a Link for *link* and place it in the site tree"""
        self._tree.add(self._tokenize(link),Link(link))

    def write_map(self):
        """Write all links of the tree to the sitemap file, one per line"""
        with open(self._file,'w') as f:
            f.write('\n'.join(link.link() for link in self._tree))

    def read_map(self):
        """Load the links listed in the sitemap file into the tree.

        An unreadable file is reported on stdout and leaves the tree as is.
        """
        # Only the file access is guarded, so an IOError raised while
        # building the links is not mistaken for a missing sitemap.
        try:
            with open(self._file) as f:
                sml = f.read().split()
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')
            return
        for line in sml:
            self.add_link(line)

    def set(self):
        """Return the set of link strings in the tree"""
        return set(link.link() for link in self._tree)

    def process(self):
        """Build the whole site in tmptarget and print the time per stage.

        Stages: prepare the pages, load language-name translations, render,
        apply templates, copy resources and write the sitemap pages.
        """
        t1 = time.time()
        for link in self._tree:
            link.prepare()
        t2 = time.time()
        print("Prepare  [%5.2f s]" % (round(t2-t1,2)))
        for link in self._tree:
            self._sitelang = self._sitelang.union(set(link.languages()))
        for tran in self._sitelang:
            if tran != 'en':
                self._tranlang[tran] = gettext.translation('iso_639_3',
                                                           languages=[tran])
        t3 = time.time()
        print("Language [%5.2f s]" % (round(t3-t2,2)))
        for link in self._tree:
            link.render()
        t4 = time.time()
        print("Render   [%5.2f s]" % (round(t4-t3,2)))
        for link in self._tree:
            link.template(self)
        t5 = time.time()
        print("Template [%5.2f s]" % (round(t5-t4,2)))
        res = set()
        for link in self._tree:
            res = res.union(link.resources())
        for f in res:
            outfile = tmptarget+f
            mkdir_p(os.path.dirname(outfile))
            shutil.copyfile(f,outfile)
        # Stamp after the copy so the copy time is reported as "Resources"
        # and not added to "Sitemap".
        t6 = time.time()
        print("Resources[%5.2f s]" % (round(t6-t5,2)))
        sitmaplink = Link('/sitemap')
        for l in self._sitelang:
            sitmaplink.add_page((l,'/sitemap.'+l+'.xml'))
        for l in self._sitelang:
            sitmaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap"))
            sitmaplink.page(l).template(self)
        t7 = time.time()
        print("Sitemap  [%5.2f s]" % (round(t7-t6,2)))

    def graph(self):
        """Build the graph of the site tree"""
        self._tree.graph()

    def gen_menu(self,lang,page,cssclass):
        """Return the HTML site menu in *lang* with *page* marked selected"""
        return self._tree.menu(lang,page,cssclass)

    def lang_menu(self,lang,link):
        """Return an HTML list linking to every language version of *link*.

        Language names are looked up in the ISO 639-3 data and, unless
        *lang* is 'en', translated into *lang*.
        """
        html = "<ul>"
        for l in link.languages():
            isoxml = u"//iso_639_3_entry[@*='"+l+"']"
            ln = self._isocode.xml_select(isoxml)[0].name
            if lang != 'en':
                ln = self._tranlang[lang].gettext(ln)
            p = link.link()
            if p[-1] == '/':
                p = p +'index'
            p = p+'.'+l
            html += '<li><a href="%s" hreflang="%s">%s</a></li>' % (p, l, ln)
        html += "</ul>"
        return html

    def publish(self):
        """Sync the generated site and the style's css and images to the output"""
        publish(tmptarget, args.output)
        publish(args.style+"css", args.output)
        publish(args.style+"images",args.output)
408
# Script body: compare the articles on disk with sitemap.txt, update the
# sitemap if they differ, then build and publish the site.
ts = time.time()
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

# Take each link set once so both differences use the same snapshot.
found = dir_.set()
mapped = sitemap.set()
missing = found - mapped
removed = mapped - found
for page in sorted(removed):
    print(page+' pages missing!!')
# The trie only stores a link whose parent tokens are already present.
# A parent link is a string prefix of its children, so sorted order adds
# parents first; plain set order could add a child before its parent.
for page in sorted(missing):
    print('adding missing page '+page)
    sitemap.add_link(page)
if missing or removed:
    print('writing new sitemap - please adjust if needed')
    sitemap.write_map()
sitemap.graph()

sitemap.process()

t1 = time.time()
sitemap.publish()
t2 = time.time()
print("Publish  [%5.2f s]" % (round(t2-t1,2)))
print("Total    [%5.2f s]" % (round(t2-ts,2)))