Adding shutil for the copying.
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 import gettext
15 import shutil
16 from amara import bindery
17 from amara.xslt import transform
18 from Cheetah.Template import Template
19
# Command line handling and module-wide settings.
#
# Both options use nargs='?', so a bare "--style" or "--output" (no value)
# is legal on the command line.  Without ``const`` argparse would hand back
# None in that case and the string concatenations below would raise
# TypeError; ``const`` makes the bare option mean "use the default".
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/',
                    const=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/',
                    const=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# The style directory is used as a plain string prefix throughout the file
# (args.style+"css", args.style+"docbook.xsl", ...), so make sure it ends
# with a path separator even when the user typed it without one.
args.style = os.path.join(args.style, '')

style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

# Scratch directory the site is assembled in before it is published.
tmptarget = tempfile.mkdtemp()+'/'

# Extensions of xi:include targets that are executed instead of included.
valid_scripts = ['.py','.pl']
# Sentinel "no upper bound" level used by genMenu.
MAXLEVEL = 10000
35
def mkdir_p(path):
    """Create directory *path* including missing parents, like ``mkdir -p``.

    An already existing directory is not an error.  Any other failure is
    re-raised as OSError, including the case where *path* exists but is not
    a directory (previously that case was silently ignored and the caller
    failed later when writing into the supposed directory).
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # EEXIST is only harmless when what exists really is a directory.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
43
def publish(src,target):
    """Mirror *src* into *target* by running ``rsync -a --delete``.

    A non-zero rsync exit status is reported on stdout; nothing is raised
    and nothing is returned.
    """
    command = ["rsync","-a","--delete",src,target]
    status = subprocess.call(command)
    if status:
        message = 'Error: '+' '.join(command)+' Returncode ['+str(status)+']'
        print(message)
49
50
# Namespace prefixes handed to bindery.parse() so the XPath expressions in
# this file can use the short db:/xi:/xl: forms.
PREFIXES = {
    u'db': u'http://docbook.org/ns/docbook',
    u'xi': u'http://www.w3.org/2001/XInclude',
    u'xl': u'http://www.w3.org/1999/xlink',
}
54
class Directory():
    """Collects the site links of the articles found below the working directory."""
    def __init__(self):
        # Root of the walk and the links gathered so far (may hold duplicates).
        self._cwd = '.'
        self._tree = []

    def scan(self):
        """Walk the tree and record a link for every ``*.xml`` article.

        Only documents carrying both a db:title and a db:titleabbrev inside
        /db:article/db:info are recorded.
        """
        for root, _subdirs, names in os.walk(self._cwd):
            for name in fnmatch.filter(names, '*.xml'):
                path = os.path.join(root, name)
                doc = bindery.parse(path, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if not (title and menu):
                    continue
                # path looks like "./dir/name.lang.xml"; the piece between
                # the first and second dot is the link stem.
                stem = path.split('.')[1]
                self._tree.append(stem.replace('index',''))

    def set(self):
        """Return the recorded links as a set (duplicates collapse)."""
        return set(self._tree)
76
class Page():
    """Class representing a version of a webpage

    One Page is one language variant of a Link: *page* is a
    ``(language, xml_file)`` tuple and *link* the owning Link object.
    """
    def __init__(self,link,page):
        self._link = link
        self._file = page[1]
        self._lang = page[0]
        self._doc = None
        self._resources = []
        self._title = None
        self._menu = None
        self._rendered_article = None

    def language(self):
        """Return the language code this page was created with."""
        return self._lang

    def resources(self):
        """Return the set of local files (link targets, images) found by prepare()."""
        return set(self._resources)

    def menu(self):
        """Return the titleabbrev text, or None if prepare() found none."""
        return self._menu

    def set_article(self,art):
        """Use *art* as the rendered article body instead of running render()."""
        self._rendered_article = art

    def prepare(self):
        """Parse the source file, expand script includes and collect resources.

        xi:include elements with parse='text' whose href has an extension
        listed in valid_scripts are executed; the XML written to stdout
        replaces the include element.
        """
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        code  = self._doc.xml_select(u"//xi:include[@parse='text']")
        if code:
            for c in code:
                (p, ext) = os.path.splitext(c.href)
                if ext in valid_scripts:
                    exe = os.path.abspath(dirname)+'/'+c.href
                    proc = subprocess.Popen([exe],stdout=subprocess.PIPE)
                    # communicate() reads all output and reaps the child.
                    output = proc.communicate()[0]
                    xstr = bindery.parse(str(output))
                    idp = c.xml_index_on_parent
                    for x in xstr.xml_children:
                        c.xml_parent.xml_insert(idp,x)
                    # Drop the include element once, after all generated
                    # nodes are in place (it used to be removed inside the
                    # loop, i.e. once per generated child).
                    c.xml_parent.xml_remove(c)

        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        """Run xsltproc on the prepared document and keep the output.

        The stylesheet is applied from inside the article's directory so
        that remaining XIncludes resolve relative to the article.  The
        working directory is restored and the temporary files are removed
        even when a step fails; a missing output file still raises IOError.
        """
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        os.chdir(dirname)
        # The input file is created in the article directory (basename
        # only); the output file lives in the system temp directory.
        infile  = os.path.basename(tempfile.mktemp())
        outfile = tempfile.mktemp()
        try:
            with open(infile,'w') as tfi:
                tfi.write(self._doc.xml_encode())
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
            cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
            retcode = subprocess.call(cmd)
            if retcode:
                print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
            with open(outfile,'r') as tfo:
                self._rendered_article = tfo.read()
        finally:
            for tmp in (infile, outfile):
                if os.path.exists(tmp):
                    os.remove(tmp)
            os.chdir(cwd)

    def template(self,sitemap):
        """Fill the HTML template for this page and write it below tmptarget.

        *sitemap* supplies the site menu, the tree menu and the language
        menu.  The output path is the source path with its final 'xml'
        replaced by 'html'.
        """
        htmlmenu =  sitemap.gen_menu(self._lang,None,"menu")
        levelmenu = sitemap.gen_menu(self._lang,self,"tree")
        # Must be computed before the Template() call; it was previously
        # spliced into the middle of the call's argument list.
        langmenu = sitemap.lang_menu(self._lang,self._link)
        template = Template(file=style_tmpl,
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'langmenu':langmenu}])
        outfile = tmptarget+'html'.join(self._file.rsplit('xml',1))
        mkdir_p(os.path.dirname(outfile))
        with open(outfile, 'w') as out:
            out.write(str(template))
169
170
class Link():
    """Class representing a webpage on the site

    A link is a site path such as '/about' or '/docs/'; it owns one Page
    per language variant found on disk.
    """
    def __init__(self,link):
        self._link = link
        # find the representations of the link.
        self._pages = []
        path = link
        # A link ending in '/' is served by the directory's index file.
        # endswith() also copes with an empty link.
        if link.endswith('/'):
            path = path+'index'
        for l in self._scan_languages(path):
            self._pages.append(Page(self,l))

    def add_page(self,l):
        """Attach a page given as a ``(language, file)`` tuple."""
        self._pages.append(Page(self,l))

    def _scan_languages(self,path):
        """Return ``(language, file)`` tuples for the files ``.<path>.<lang>.xml``.

        The glob is anchored with '.' right after *path* so that '/about'
        no longer picks up './aboutus.en.xml' as one of its own variants.
        """
        lang = []
        for l in glob.glob('.'+path+'.*'):
            # './about.en.xml' splits into ['', '/about', 'en', 'xml']
            ls = l.split('.')
            if len(ls) > 3 and ls[3] == 'xml':
                lang.append((ls[2],l))
        return lang

    def link(self):
        """Return the site path this object was created with."""
        return self._link

    def prepare(self):
        """Call prepare() on every language variant."""
        for page in self._pages:
            page.prepare()

    def languages(self):
        """Return the language codes of all variants, in page order."""
        return [page.language() for page in self._pages]

    def render(self):
        """Call render() on every language variant."""
        for page in self._pages:
            page.render()

    def template(self,sitemap):
        """Call template(sitemap) on every language variant."""
        for page in self._pages:
            page.template(sitemap)

    def page(self,lang):
        """Return the first page whose language equals *lang*, or None."""
        for page in self._pages:
            if page.language()==lang:
                return page
        return None

    def resources(self):
        """Return the union of the resource sets of all variants."""
        res  = set()
        for page in self._pages:
            res = res.union(page.resources())
        return res
227
228
class Node():
    """One entry of the Trie: a token, the value stored under it and its child nodes."""
    def __init__(self,token,value):
        self._token, self._value = token, value
        # Callers append to the list returned by children(), so it must be
        # this very list object.
        self._children = list()

    def children(self):
        """Return the (mutable) list of child nodes."""
        return self._children

    def value(self):
        """Return the stored value."""
        return self._value

    def token(self):
        """Return the token this node is keyed by."""
        return self._token
243
class Trie():
    """Tree of Node objects keyed by the path tokens of a link.

    Iterating the trie yields the stored values, each parent before its
    children, siblings in insertion order.
    """
    def __init__(self):
        self._root = []

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self,t):
        """Yield every value below the node list *t*, parent first, then its subtree."""
        for l in t:
            yield l.value()
            for x in self.inorder(l.children()):
                yield x

    def _add(self,trie, key, content):
        """Insert *content* below *trie* following the token list *key*.

        *key* is not modified.  If an intermediate token has no node the
        content is not inserted (no error is raised).  When several siblings
        carry the same token the first one is used.
        """
        k, rest = key[0], key[1:]
        # is the key a leaf
        if not rest:
            trie.append(Node(k,content))
            return
        for ch in trie:
            if ch.token() == k:
                self._add(ch.children(), rest, content)
                return

    def add(self,key, content):
        """Insert *content* under the token sequence *key* (any iterable of tokens)."""
        # Work on a private list so the caller's sequence is left untouched.
        self._add(self._root, list(key), content)

    def _graph(self, trie, G):
        """Add the nodes and parent->child edges below *trie* to graph *G*."""
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            # Recurse once per node; this used to sit inside the loop above
            # and repeated the whole subtree walk for every child.
            self._graph(l.children(), G)

    def graph(self):
        """Build a directed pygraphviz graph of the trie and return it."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
#        print G.string()
        return G

    def _menu(self, trie, lang, page, css):
        """Render the nodes of *trie* as a nested HTML list for language *lang*.

        The entry whose page is *page* gets class "selected".  Entries
        without a page in *lang* link to the English page and are marked
        with a trailing '*'.
        """
        html = "<ul%s>\n" % css
        for l in trie:
            sel = ''
            p = l.value().page(lang)
            # Only a real page can be the selected one; with page=None
            # ("nothing selected") an untranslated entry (p is None) used
            # to compare equal and was highlighted.
            if p is not None and p == page:
                sel = ' class="selected"'
            if p is not None:
                html += '<li%s><a href="%s">%s</a>\n' \
                    % (sel,l.value().link(),p.menu())
            else:
                html += '<li%s><a href="%s.en" hreflang="en">%s</a>*\n' \
                    % (sel,l.value().link(), l.value().page('en').menu())
            if l.children():
                html += self._menu(l.children(), lang, page, "")
        html += "</ul>\n"
        return html

    def menu(self,lang,page,cssclass):
        """Return the whole trie as HTML; *cssclass*, if truthy, is set on the outer list."""
        css = ''
        if cssclass:
            css = ' class="'+cssclass+'"'
        return self._menu(self._root, lang, page, css)
311
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()
        # Language codes seen on any page; filled in by process().
        self._sitelang = set()
        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
        # gettext catalogs per non-English site language; filled in by process().
        self._tranlang = {}

    def add_link(self, link):
        """Split *link* into path tokens and store a Link for it in the tree."""
        tokens = [t for t in re.split(r'(^/\w*/|\w*/)',link) if t]
        self._tree.add(tokens,Link(link))

    def write_map(self):
        """Write all links of the tree to the sitemap file, one per line."""
        # Build the content first so a failure while walking the tree does
        # not leave a truncated file behind.
        content = '\n'.join(link.link() for link in self._tree)
        with open(self._file,'w') as f:
            f.write(content)

    def read_map(self):
        """Load the links listed in the sitemap file, if it can be read."""
        try:
            with open(self._file) as f:
                sml = f.read().split()
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')
            return
        for line in sml:
            self.add_link(line)

    def set(self):
        """Return the set of link paths currently in the tree."""
        return set(link.link() for link in self._tree)

    def process(self):
        """Prepare, render and template every page, copy resources, build the sitemap page.

        Prints the wall-clock time of each phase.
        """
        t1 = time.time()
        for link in self._tree:
            link.prepare()
        t2 = time.time()
        print("Prepare  [%5.2f s]" % (round(t2-t1,2)))
        for link in self._tree:
            self._sitelang = self._sitelang.union(set(link.languages()))
        for tran in self._sitelang:
            if tran != 'en':
                self._tranlang[tran] = gettext.translation('iso_639_3', languages=[tran])
        t3 = time.time()
        print("Language [%5.2f s]" % (round(t3-t2,2)))
        for link in self._tree:
            link.render()
        t4 = time.time()
        print("Render   [%5.2f s]" % (round(t4-t3,2)))
        for link in self._tree:
            link.template(self)
        t5 = time.time()
        print("Template [%5.2f s]" % (round(t5-t4,2)))
        res = set()
        for link in self._tree:
            res = res.union(link.resources())
        for f in res:
            outfile = tmptarget+f
            mkdir_p(os.path.dirname(outfile))
            shutil.copyfile(f,outfile)
        # Stamp after the copy so "Resources" measures the copy and
        # "Sitemap" below no longer includes it.
        t6 = time.time()
        print("Resources[%5.2f s]" % (round(t6-t5,2)))
        sitmaplink = Link('/sitemap')
        for l in self._sitelang:
            sitmaplink.add_page((l,'/sitemap.'+l+'.xml'))
        for l in self._sitelang:
            sitmaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap"))
            sitmaplink.page(l).template(self)
        t7 = time.time()
        print("Sitemap  [%5.2f s]" % (round(t7-t6,2)))

    def graph(self):
        """Build the graph of the link tree."""
        self._tree.graph()

    def gen_menu(self,lang,page,cssclass):
        """Return the HTML menu of the whole tree for *lang*, highlighting *page*."""
        return self._tree.menu(lang,page,cssclass)

    def lang_menu(self,lang,link):
        """Return an HTML list linking to every language variant of *link*.

        Language names come from the iso_639_3 data and are passed through
        the gettext catalog of *lang* unless *lang* is 'en'.
        """
        html = "<ul>"
        for l in link.languages():
            isoxml = u"//iso_639_3_entry[@*='"+l+"']"
            ln = self._isocode.xml_select(isoxml)[0].name
            if lang != 'en':
                ln = self._tranlang[lang].gettext(ln)
            p = link.link()
            if p[-1] == '/':
                p = p +'index'
            p = p+'.'+l
            html += '<li><a href="%s" hreflang="%s">%s</a></li>' % (p, l, ln)
        html += "</ul>"
        return html

    def publish(self):
        """Sync the assembled site and the style's css/images to the output directory."""
        # These calls reach the module-level publish() function.
        publish(tmptarget, args.output)
        publish(args.style+"css", args.output)
        publish(args.style+"images",args.output)
408
def generateSitemap():
    """Merge sitemap.txt with the articles found on disk and rewrite sitemap.txt.

    Returns a list of dicts in sitemap order.  Entries read from
    sitemap.txt start as ``{'link': ...}`` and are filled in with title,
    menu, output, exe, file, res and level when a matching article is
    found; articles not yet listed are appended.
    NOTE(review): an entry listed in sitemap.txt with no article on disk
    keeps only its 'link' key - confirm callers can cope with that.
    """
    sitemap = []
    try:
        with open('sitemap.txt') as sfile:
            flist = sfile.read().split()
    except IOError:
        print('Sitemap missing - generating one.')
    else:
        for f in flist:
            sitemap.append(dict(link=f))

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                # exe flags that at least one include is an executable script.
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                if title and menu:
                    found = 0
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    tokens = [t for t in re.split(r'(^/\w*/|\w*/)',link) if t]
                    level = len(tokens)
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    # Swap only the trailing 'xml' for 'html' (as
                    # Page.template does); str.replace would also rewrite
                    # an "xml" occurring earlier in the file name.
                    htmlname = 'html'.join(filename.rsplit('xml',1))
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname, htmlname),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print("adding "+link+" to sitemap")
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    with open('sitemap.txt','w') as sfile:
        for l in sitemap:
            sfile.write(l['link']+'\n')
    return sitemap
475
def expandXincludeTxt(page):
    """Parse the article of *page* and expand its executable text includes.

    *page* is a sitemap dict; its 'file' entry is parsed and, when 'exe' is
    set, every xi:include with parse='text' whose href extension is listed
    in valid_scripts is executed and replaced by the XML it prints.
    Returns the (possibly modified) document.
    """
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        # Resolve scripts relative to the article, as Page.prepare does and
        # as xsltproc later resolves the remaining includes; the href used
        # to be resolved against the process working directory.
        basedir = os.path.abspath(os.path.dirname(page['file']))
        code  = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                exe = os.path.join(basedir, c.href)
                proc = subprocess.Popen([exe],stdout=subprocess.PIPE)
                # communicate() reads all output and reaps the child.
                output = proc.communicate()[0]
                xstr = bindery.parse(str(output))
                idx = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(idx,x)
                c.xml_parent.xml_remove(c)
    return doc
493
def xsltConvert(doc, rundir=None):
    """Transform *doc* with the site stylesheet via xsltproc and return the output text.

    rundir -- directory to run xsltproc in.  When omitted, the directory of
              the module-level ``page`` dict's 'file' entry is used, which
              is what this function always did implicitly.

    The working directory is restored and the temporary files are removed
    even when a step fails; a missing output file still raises IOError.
    """
#  amara can not handle the docbook stylesheets
#  xmlarticle = transform(doc,style_xslt)
    if rundir is None:
        # Historical behaviour: depend on the global loop variable.
        rundir = os.path.dirname(page['file'])
    cwd = os.getcwd()
    os.chdir(rundir)
    # The input file is created in rundir (basename only); the output file
    # lives in the system temp directory.
    infile  = os.path.basename(tempfile.mktemp())
    outfile = tempfile.mktemp()
    try:
        with open(infile,'w') as tfi:
            tfi.write(doc.xml_encode())
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
        with open(outfile,'r') as tfo:
            result = tfo.read()
    finally:
        for tmp in (infile, outfile):
            if os.path.exists(tmp):
                os.remove(tmp)
        os.chdir(cwd)
    return result
517
def genMenu(page,sitemap,slevel,elevel):
    """Render sitemap entries as an HTML list and return ``(html, title)``.

    page    -- sitemap dict of the current page, or None
    sitemap -- list of dicts with 'link', 'menu' and 'level' keys
    slevel, elevel -- only entries with slevel <= level <= elevel are emitted

    With elevel equal to MAXLEVEL or 1, or without a page, the whole
    sitemap is considered and the title is the menu text of the '/' entry.
    Otherwise only the run of entries at the page's level that follows the
    nearest preceding entry of another level is rendered, and that entry's
    menu text is the title.
    """
    heading = None
    if elevel == MAXLEVEL or elevel == 1 or page is None:
        pieces = ['<ul>\n']
        entries = sitemap
    else:
        pieces = ['<ul class="tree">\n']
        entries = []
        pos = sitemap.index(page)
        # Walk back to the closest earlier entry on a different level ...
        while sitemap[pos]['level'] == page['level']:
            pos -= 1
        heading = sitemap[pos]['menu']
        # ... then collect the same-level run that follows it.
        pos += 1
        while pos < len(sitemap) and sitemap[pos]['level'] == page['level']:
            entries.append(sitemap[pos])
            pos += 1

    previous = slevel
    for entry in entries:
        if entry['level'] < slevel or entry['level'] > elevel:
            continue
        href = entry['link']
        if not heading and href == '/':
            heading = entry['menu']

        if previous < entry['level']:
            pieces.append('<ul>\n')
        elif previous > entry['level']:
            if href[-1] == '/':
                pieces.append('</li>\n')
            pieces.append('</ul>\n</li>\n')

        if page is not None and page == entry:
            pieces.append('<li class="selected"><a href="%s">%s</a>' % (href,entry['menu']))
        else:
            pieces.append('<li><a href="%s">%s</a>' % (href,entry['menu']))
        # Directory links (ending in '/', except the root) stay open so a
        # nested list can follow.
        if href[-1] != '/' or href == '/':
            pieces.append('</li>\n')
        previous = entry['level']
    pieces.append('</ul>\n')
    return (''.join(pieces),heading)
557
def writeToTemplate(page,doc,sitemap):
    """Fill the HTML template for *page* and write it, plus its resources, below tmptarget.

    page    -- sitemap dict of the page ('title', 'level', 'output', 'res' are read)
    doc     -- rendered article text placed in the template's 'article' slot
    sitemap -- full sitemap list, used for the site menu and the level menu
    """
    # Only the HTML of the full menu is needed; its title is not used.
    menu = genMenu(page,sitemap,1,MAXLEVEL)[0]
    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
    template = Template(file=style_tmpl,
                        searchList=[{'title':page['title']},
                                    {'menu':menu},
                                    {'article':doc},
                                    {'levelmenu':levelmenu},
                                    {'levelname':levelname}])
    outfile = tmptarget+page['output']
    mkdir_p(os.path.dirname(outfile))
    # 'with' closes the file even if rendering or writing raises.
    with open(outfile, 'w') as out:
        out.write(str(template))
    for r in page['res']:
        mkdir_p(os.path.dirname(tmptarget+r))
        shutil.copyfile(r, tmptarget+r)
575
def createSitemap(sitemap):
    """Write the sitemap page (sitemap.en.html) below tmptarget.

    The full site menu built from *sitemap* is used both as the page menu
    and as the article body.
    """
    # Only the HTML of the menu is needed; its title is not used.
    menu = genMenu(None,sitemap,1,MAXLEVEL)[0]
    template = Template(file=style_tmpl,
                        searchList=[
            {'title':'Sitemap'},
            {'menu':menu},
            {'article':menu},
            {'levelmenu':''},
            {'levelname':''}])
    outfile = tmptarget+'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    # 'with' closes the file even if rendering or writing raises.
    with open(outfile, 'w') as out:
        out.write(str(template))
590
# --- Site build: reconcile sitemap.txt with the articles on disk, then
# --- prepare, render, template and publish every page.
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

# missing: on disk but not listed; removed: listed but no longer on disk.
missing = dir_.set() - sitemap.set()
removed = sitemap.set() - dir_.set()
for page in sorted(removed):
    print(page+' pages missing!!')
# Sorted order puts a parent link ('/a/') before its children ('/a/x'),
# which the tree needs in order to attach the child.
for page in sorted(missing):
    print('adding missing page '+page)
    sitemap.add_link(page)
# The two sets are disjoint by construction, so the former test on their
# intersection could never be true and the sitemap was never rewritten.
if missing or removed:
    print('writing new sitemap - please adjust if needed')
    sitemap.write_map()
sitemap.graph()

sitemap.process()

t1 = time.time()
sitemap.publish()
t2 = time.time()
print("Publish  [%5.2f s]" % (round(t2-t1,2)))
615
# --- Second, function-based build pass.  It rebinds the names 'sitemap'
# --- (now a list of dicts) and 'tmptarget' (a fresh scratch directory), so
# --- it must stay after the class-based pass above.
sitemap = generateSitemap()
tmptarget = tempfile.mkdtemp()+'/'
tot = 0
# NOTE: the loop variable must be called 'page': xsltConvert() reads the
# module-level name 'page' to find the directory to run in.
for page in sitemap:
    t1 = time.time()
    # Trailing comma: stay on the same line so the timing is appended below.
    print "Page : %-30s %30s" % (page['link'],
                        time.ctime(os.stat(page['file']).st_mtime)),
    doc = expandXincludeTxt(page)
    pubdoc = xsltConvert(doc)
    writeToTemplate(page,pubdoc,sitemap)
    t2 = time.time()
    print "[%5.2f s]" % (round(t2-t1,2))
    tot = tot + (t2-t1)

print "Total time\t\t\t\t\t\t\t     [%5.2f s]" % (round(tot,2))
createSitemap(sitemap)
# Sync the generated pages and the style's css/images to the output directory.
publish(tmptarget, args.output)
publish(args.style+"css", args.output)
publish(args.style+"images",args.output)