Added a gen_menu stub in Sitemap that will generate menus from a prepared sitemap.
[treecutter.git] / src / tree-cutter.py
#!/usr/bin/python
import argparse
import errno
import fnmatch
import glob
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time

import amara
import pygraphviz as pgv
from amara import bindery
from amara.xslt import transform
from Cheetah.Template import Template

# Command-line interface.  Both options default to sibling directories of
# the current working directory: ../style/default/ for the stylesheets and
# templates, ../htdocs/ for the published output.
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# XSL stylesheet handed to xsltproc and the Cheetah template every page
# is rendered into.
style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

# Extensions of <xi:include parse='text'> targets that are executed and
# whose stdout is spliced back into the document.
valid_scripts = ['.py','.pl']
# Sentinel level meaning "no depth limit" when generating menus.
MAXLEVEL = 10000
def mkdir_p(path):
    """Create *path* and any missing parents, like `mkdir -p`.

    Succeeds silently when the directory already exists; re-raises every
    other OSError (permissions, missing device, ...).
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # EEXIST is only benign when the existing entry really is a
        # directory; a plain file occupying the path is still an error.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else: raise
39
def publish(src,target):
    """Mirror *src* into *target* with `rsync -a --delete`.

    Failures are reported on stdout but not raised, matching the
    best-effort behaviour of the rest of the script.
    """
    cmd = ["rsync","-a","--delete",src,target]
    retcode = subprocess.call(cmd)
    if retcode:
        # print() form works on both Python 2 and 3 for a single argument;
        # the old `print expr` statement is Python-2-only syntax.
        print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
45
46
# XML namespace prefixes shared by every amara/bindery XPath query below.
PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}
50
class Directory():
    """Collects site-relative links for every DocBook article below cwd."""

    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
        """Walk the working tree and record a link for every *.xml file
        that carries both a title and a menu entry (titleabbrev)."""
        for dirpath, subdirs, files in os.walk(self._cwd):
            for name in files:
                if not fnmatch.fnmatch(name, '*.xml'):
                    continue
                path = os.path.join(dirpath, name)
                doc = bindery.parse(path, prefixes=PREFIXES)
                has_title = doc.xml_select(u'/db:article/db:info/db:title')
                has_menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if has_title and has_menu:
                    # './foo/index.en.xml' -> '/foo/index' -> '/foo/'
                    self._tree.append(path.split('.')[1].replace('index',''))

    def set(self):
        """Return the recorded links as a set (duplicates collapsed)."""
        return set(self._tree)
72
class Page():
    """Class representing one language version of a webpage.

    *page* is a (lang, filename) tuple as produced by Link._scan_languages.
    """
    def __init__(self,page):
        self._file = page[1]
        self._lang = page[0]
        self._doc = None             # parsed amara document, set by prepare()
        self._resources = []         # existing files the article references
        self._title = None
        self._menu = None
        self._rendered_article = None

    def prepare(self):
        """Parse the article, expand executable text includes and collect
        linked / embedded resource files that exist on disk."""
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        # BUG FIX: these two assignments read the bare name `doc`
        # (NameError); the parsed document lives in self._doc.
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        code  = self._doc.xml_select(u"//xi:include[@parse='text']")
        if code:
            for c in code:
                (p, ext) = os.path.splitext(c.href)
                if ext in valid_scripts:
                    exe = os.path.join(os.path.abspath(dirname+c.href))
                    xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                    xstr = bindery.parse(str(xml.stdout.read()))
                    idp = c.xml_index_on_parent
                    for x in xstr.xml_children:
                        c.xml_parent.xml_insert(idp,x)
                    # BUG FIX: the include node was removed INSIDE the loop
                    # above, once per spliced child; remove it exactly once
                    # after splicing (matches expandXincludeTxt).
                    c.xml_parent.xml_remove(c)

        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        """Transform the prepared document to HTML.

        amara can not handle the docbook stylesheets, so the document is
        round-tripped through temp files and piped through xsltproc.
        """
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        os.chdir(dirname)
        infile  = os.path.basename(tempfile.mktemp())
        outfile = tempfile.mktemp()
        tfi = open(infile,'w')
        # BUG FIX: was `doc.xml_encode()` (NameError) - use self._doc.
        tfi.write(self._doc.xml_encode())
        tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
        tfo = open(outfile,'r')
        self._rendered_article = tfo.read()
        tfo.close()
        os.remove(infile)
        os.remove(outfile)
        os.chdir(cwd)

    def template(self,sitemap):
        """Render this page through the Cheetah site template into the
        global tmptarget staging directory."""
        htmlmenu =  sitemap.gen_menu(self._lang,None)
        levelmenu = sitemap.gen_menu(self._lang,self)
        # NOTE(review): Sitemap.gen_menu is still a stub that returns a
        # plain string, so no menu title is available; the old code
        # referenced an undefined `levelname` here (NameError).
        levelname = ''
        template = Template(file=style_tmpl,
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'levelname':levelname}])
        outfile = tmptarget+self._file+'.'+self._lang+'.html'
        mkdir_p(os.path.dirname(outfile))
        out = open(outfile, 'w')
        out.write(str(template))
        out.close()
151
152
class Link():
    """Class representing a webpage on the site.

    A link such as '/about/' may exist in several languages; every
    matching file ('./about/index.<lang>.xml') becomes a Page.
    """
    def __init__(self,link):
        self._link = link
        # Directory links end in '/' and are backed by index pages.
        # endswith() also copes with an empty link string, which the old
        # `self._link[-1]` indexing crashed on (IndexError).
        path = link
        if self._link.endswith('/'):
            path = path+'index'
        self._pages = []
        for lang in self._scan_languages(path):
            self._pages.append(Page(lang))

    def _scan_languages(self,path):
        """Return (lang, filename) tuples for every './<path>.<lang>.xml'."""
        lang = []
        for l in glob.glob('.'+path+'*'):
            # './about/index.en.xml'.split('.') ->
            #   ['', '/about/index', 'en', 'xml']
            ls = l.split('.')
            if len(ls) > 3 and ls[3] == 'xml':
                lang.append((ls[2],l))
        return lang

    def link(self):
        return self._link
176
class Node():
    """One sitemap-trie node: a path token, a payload and child nodes."""

    def __init__(self,token,value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        """Path component this node represents."""
        return self._token

    def value(self):
        """Payload (a Link) stored at this node."""
        return self._value

    def children(self):
        """Mutable list of child Nodes; callers append to it directly."""
        return self._children
191
class Trie():
    """Prefix tree of path tokens; each inserted key stores a Link payload."""

    def __init__(self):
        self._root = []

    def _add(self,trie, key, content):
        """Recursively insert *content* under the token list *key*.

        NOTE(review): consumes *key* destructively (pop) and silently
        drops the entry when an intermediate token has no node yet -
        confirm that callers always add parents before children.
        """
        k = key.pop(0)
        if key == []:
            # Key exhausted: this token becomes a leaf carrying the payload.
            node = Node(k,content)
            trie.append(node)
        else:
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)

    def add(self,key, content):
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        """Add every node and parent->child edge below *trie* to graph G."""
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            # BUG FIX: the recursion used to sit inside the loop above,
            # re-walking the whole subtree once per child and emitting
            # duplicate nodes/edges; descend exactly once per node.
            self._graph(l.children(), G)

    def graph(self):
        """Dump the sitemap tree as a graphviz graph on stdout."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
        print(G.string())
226
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()

    def add_link(self, link):
        """Insert *link* into the trie, keyed by its path components."""
        # list() so Trie._add can pop() even where filter() returns an
        # iterator (Python 3); on Python 2 filter already returns a list.
        tokens = list(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
        self._tree.add(tokens,Link(link))

    def read_map(self):
        """Load links from sitemap.txt; a missing file is not an error."""
        try:
            f = open(self._file)
            sml = f.read().split()
            f.close()
            for line in sml:
                self.add_link(line)
        except IOError:
            # `except E, name` is Python<2.6-only syntax and the bound
            # name was unused; use the `as`-less modern form (consistent
            # with mkdir_p above).
            print('INFO: Could not read sitemap.txt - one will be created')

    def set(self):
        """Return the set of links stored in the sitemap.

        BUG FIX: the old body iterated the Trie directly, but Trie defines
        no __iter__ (TypeError at runtime); walk the tree explicitly and
        collect every Link payload.
        """
        links = set()
        def _collect(nodes):
            for node in nodes:
                if node.value() is not None:
                    links.add(node.value().link())
                _collect(node.children())
        _collect(self._tree._root)
        return links

    def graph(self):
        """Dump the trie as a graphviz graph (debugging aid)."""
        self._tree.graph()

    def gen_menu(self,lang,page):
        # TODO: real implementation pending; placeholder keeps callers alive.
        return 'Generate menu from sitemap - To be implemented'
255
def generateSitemap():
    """Scan the working tree for DocBook articles and sync sitemap.txt.

    Returns a list of page dicts (link/title/menu/output/exe/file/res/level)
    in sitemap.txt order, with newly discovered pages appended, and writes
    the link list back to sitemap.txt.
    """
    sitemap = []
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError:
        # Modern except syntax (the old `except IOError, e` form is
        # Python<2.6-only and the bound name was unused); a missing
        # sitemap is expected on first run.
        print('Sitemap missing - generating one.')

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile,
                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
                                              u'xi': u'http://www.w3.org/2001/XInclude',
                                              u'xl': u'http://www.w3.org/1999/xlink'})
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                # Flag pages that embed executable text includes (.py/.pl).
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                if title and menu:
                    found = 0
                    # './foo/index.en.xml' -> base '/foo/index' -> link '/foo/'
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    filename.replace('xml','html')),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print("adding "+link+" to sitemap")
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    # Persist the (possibly extended) link list; `with` closes the handle
    # even if a write fails.
    with open('sitemap.txt','w') as sfile:
        for l in sitemap:
            sfile.write(l['link']+'\n')
    return sitemap
322
def expandXincludeTxt(page):
    """Parse page['file'] and expand executable text includes.

    For each <xi:include parse='text'> whose target is a .py/.pl script,
    the script is executed and its XML stdout replaces the include node.
    Returns the amara document.
    """
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        code  = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                exe = os.path.join(os.path.abspath(c.href))
                xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                xstr = bindery.parse(str(xml.stdout.read()))
                # `idx` rather than `id`: do not shadow the builtin.
                idx = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(idx,x)
                c.xml_parent.xml_remove(c)
    return doc
340
def xsltConvert(doc, page=None):
    """Transform *doc* (DocBook) to HTML via xsltproc; return the HTML.

    amara can not handle the docbook stylesheets, so the document is
    round-tripped through temp files in the article's directory.

    BUG FIX: the old body silently read the module-global loop variable
    `page` despite taking only *doc*; *page* is now an explicit, optional
    parameter (defaulting to the global for backward compatibility with
    the existing call site).
    """
    if page is None:
        page = globals()['page']
    cwd = os.getcwd()
    rundir = os.path.dirname(page['file'])
    os.chdir(rundir)
    infile  = os.path.basename(tempfile.mktemp())
    outfile = tempfile.mktemp()
    tfi = open(infile,'w')
    tfi.write(doc.xml_encode())
    tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
    cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
    retcode = subprocess.call(cmd)
    if retcode:
        print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
    tfo = open(outfile,'r')
    result = tfo.read()
    tfo.close()
    os.remove(infile)
    os.remove(outfile)
    os.chdir(cwd)
    return result
364
def genMenu(page,sitemap,slevel,elevel):
    """Build a nested <ul> menu from *sitemap*.

    Entries with level outside [slevel, elevel] are skipped.  When *page*
    is given and the level window is narrow, only the run of same-level
    siblings around *page* is rendered.  Returns (html, title).
    """
    title = None
    if elevel == MAXLEVEL or elevel == 1 or page == None:
        entries = sitemap
        parts = ['<ul>\n']
    else:
        # Sibling menu: back up to the entry just above this page's level
        # (its menu text becomes the title), then gather the run of
        # same-level entries that follows it.
        parts = ['<ul class="tree">\n']
        pos = sitemap.index(page)
        while sitemap[pos]['level'] == page['level']:
            pos -= 1
        title = sitemap[pos]['menu']
        pos += 1
        entries = []
        while pos < len(sitemap) and sitemap[pos]['level'] == page['level']:
            entries.append(sitemap[pos])
            pos += 1

    prev = slevel
    for entry in entries:
        lvl = entry['level']
        if lvl < slevel or lvl > elevel:
            continue
        if not title and entry['link'] == '/':
            title = entry['menu']

        # Open/close nesting as the level changes between entries.
        if prev < lvl:
            parts.append('<ul>\n')
        elif prev > lvl:
            if entry['link'][-1] == '/':
                parts.append('</li>\n')
            parts.append('</ul>\n</li>\n')
        css = ' class="selected"' if page != None and page == entry else ''
        parts.append('<li%s><a href="%s">%s</a>' % (css, entry['link'], entry['menu']))
        # Leaf pages (and the root) close immediately; directory links stay
        # open for their children.
        if entry['link'][-1] != '/' or entry['link'] == '/':
            parts.append('</li>\n')
        prev = lvl
    parts.append('</ul>\n')
    return (''.join(parts), title)
404
def writeToTemplate(page,doc,sitemap):
    """Render *doc* through the Cheetah site template and stage the result.

    Writes the HTML under the global tmptarget staging directory and
    copies every resource file the page references next to it.
    """
    (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
    template = Template(file=style_tmpl,
                        searchList=[{'title':page['title']},
                                    {'menu':menu},
                                    {'article':doc},
                                    {'levelmenu':levelmenu},
                                    {'levelname':levelname}])
    outfile = tmptarget+page['output']
    mkdir_p(os.path.dirname(outfile))
    # `with` guarantees the handle is closed even if the write raises.
    with open(outfile, 'w') as out:
        out.write(str(template))
    for r in page['res']:
        mkdir_p(os.path.dirname(tmptarget+r))
        shutil.copyfile(r, tmptarget+r)
422
def createSitemap(sitemap):
    """Render the sitemap overview page (sitemap.en.html) into tmptarget.

    The full menu doubles as the article body.
    """
    (menu,menuname) = genMenu(None,sitemap,1,MAXLEVEL)
    template = Template(file=style_tmpl,
                        searchList=[
            {'title':'Sitemap'},
            {'menu':menu},
            {'article':menu},
            {'levelmenu':''},
            {'levelname':''}])
    outfile = tmptarget+'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    # `with` guarantees the handle is closed even if the write raises.
    with open(outfile, 'w') as out:
        out.write(str(template))
437
# ---------------------------------------------------------------------------
# Main: reconcile on-disk articles with the recorded sitemap, then build
# every page into a temp staging directory and rsync it to the output.
# ---------------------------------------------------------------------------
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

missing = dir_.set() - sitemap.set()
removed = sitemap.set() - dir_.set()
for page in removed:
    # BUG FIX: was `print removed+' pages missing!!'` - concatenating a
    # set with a string raises TypeError; report each page individually.
    print(page+' page missing!!')

for page in missing:
    print('adding missing page '+page)
    # BUG FIX: Sitemap has no add_page() method; links are added through
    # add_link().
    sitemap.add_link(page)

sitemap.graph()


sitemap = generateSitemap()
tmptarget = tempfile.mkdtemp()+'/'
for page in sitemap:
    t1 = time.time()
    # sys.stdout.write instead of the Python-2-only `print expr,`
    # trailing-comma form; the timing suffix is printed after the build.
    sys.stdout.write("Page : %-30s %30s " % (page['link'],
                        time.ctime(os.stat(page['file']).st_mtime)))
    doc = expandXincludeTxt(page)
    pubdoc = xsltConvert(doc)
    writeToTemplate(page,pubdoc,sitemap)
    t2 = time.time()
    print("[%5.2f s]" % (round(t2-t1,2)))

createSitemap(sitemap)
publish(tmptarget, args.output)
publish(args.style+"css", args.output)
publish(args.style+"images",args.output)