Adding some getters and setters to class Page.
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 from amara import bindery
15 from amara.xslt import transform
16 from Cheetah.Template import Template
17
# Command-line interface: --style points at the directory holding the XSL
# stylesheet and Cheetah template, --output at the publish target.
# Defaults are sibling directories of the current working directory.
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

style_xslt = args.style+"docbook.xsl"  # docbook -> html stylesheet
style_tmpl = args.style+"index.en.html.tmpl"  # Cheetah page template
outputdir = args.output

# Staging area; pages are rendered here and rsynced to the target at the end.
tmptarget = tempfile.mkdtemp()+'/'

# xi:include hrefs with these extensions are executed and their stdout spliced in.
valid_scripts = ['.py','.pl']
# Sentinel meaning "no depth limit" for menu generation.
MAXLEVEL = 10000
33
def mkdir_p(path):
    """Create *path* and any missing parents, like ``mkdir -p``.

    Succeeds silently when the directory already exists; re-raises any
    other OSError (permissions, read-only filesystem, ...).  Unlike the
    previous version, an existing non-directory at *path* is re-raised
    instead of being silently ignored.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
41
42 def publish(src,target):
43     cmd = ["rsync","-a","--delete",src,target]
44     retcode = subprocess.call(cmd)
45     if retcode:
46         print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
47
48
# XML namespace prefixes shared by every xml_select() query in this file.
PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}
52
class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
        """Walk the article tree and record a link for every *.xml file
        that carries both a db:title and a db:titleabbrev."""
        for dirpath, subdirs, files in os.walk(self._cwd):
            for name in files:
                if not fnmatch.fnmatch(name, '*.xml'):
                    continue
                path = os.path.join(dirpath, name)
                doc = bindery.parse(path, prefixes=PREFIXES)
                has_title = doc.xml_select(u'/db:article/db:info/db:title')
                has_menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if has_title and has_menu:
                    # './foo/index.en.xml' -> '/foo/index' -> '/foo/'
                    stem = path.split('.')[1]
                    self._tree.append(stem.replace('index', ''))

    def set(self):
        return set(self._tree)
74
75 class Page():
76     """Class representing a version of a webpage"""
77     def __init__(self,page):
78         self._file = page[1]
79         self._lang = page[0]
80         self._doc = None
81         self._resources = []
82         self._title = None
83         self._menu = None
84         self._rendered_article = None
85
86     def language(self):
87         return self._lang
88
89     def menu(self):
90         return self._menu
91
92     def set_article(self,art):
93         self._rendered_article = art
94
95     def prepare(self):
96         self._doc = bindery.parse(self._file, prefixes=PREFIXES)
97         if self._doc.xml_select(u'/db:article/db:info/db:title'):
98             self._title = unicode(self._doc.article.info.title)
99         if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
100             self._menu = unicode(self._doc.article.info.titleabbrev)
101
102         dirname = os.path.dirname(self._file)
103         code  = self._doc.xml_select(u"//xi:include[@parse='text']")
104         if code:
105             for c in code:
106                 (p, ext) = os.path.splitext(c.href)
107                 if ext in valid_scripts:
108                     exe = os.path.join(os.path.abspath(dirname+c.href))
109                     xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
110                     xstr = bindery.parse(str(xml.stdout.read()))
111                     idp = c.xml_index_on_parent
112                     for x in xstr.xml_children:
113                         c.xml_parent.xml_insert(idp,x)
114                         c.xml_parent.xml_remove(c)
115
116         for r in self._doc.xml_select(u"//db:link[@xl:href]"):
117             rf = os.path.join(dirname,r.href)
118             if os.path.isfile(rf):
119                 self._resources.append(rf)
120         for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
121             im = os.path.join(dirname,i.fileref)
122             if os.path.isfile(im):
123                 self._resources.append(im)
124
125     def render(self):
126         #  amara can not handle the docbook stylesheets
127         #  xmlarticle = transform(doc,style_xslt)
128         cwd = os.getcwd()
129         dirname = os.path.dirname(self._file)
130         os.chdir(dirname)
131         infile  = os.path.basename(tempfile.mktemp())
132         outfile = tempfile.mktemp()
133         tfi = open(infile,'w')
134         tfi.write(self._doc.xml_encode())
135         tfi.close()
136 #  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
137         cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
138         retcode = subprocess.call(cmd)
139         if retcode:
140             print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
141         tfo = open(outfile,'r')
142         self._rendered_article = tfo.read()
143         tfo.close()
144         os.remove(infile)
145         os.remove(outfile)
146         os.chdir(cwd)
147
148     def template(self,sitemap):
149         htmlmenu =  sitemap.gen_menu(self._lang,None)
150         levelmenu = sitemap.gen_menu(self._lang,self)
151         template = Template(file=style_tmpl,
152                             searchList=[{'title':self._title},
153                                         {'menu':htmlmenu},
154                                         {'article':self._rendered_article},
155                                         {'levelmenu':levelmenu},
156                                         {'levelname':levelname}])
157         outfile = tmptarget+self._file+'.'+self._lang+'.html'
158         mkdir_p(os.path.dirname(outfile))
159         out = open(outfile, 'w')
160         out.write(str(template))
161         out.close()
162
163
class Link():
    """Class representing a webpage on the site"""
    def __init__(self, link):
        self._link = link
        # Find every language representation of this link on disk.
        self._pages = []
        path = link + 'index' if link[-1] == '/' else link
        for entry in self._scan_languages(path):
            self._pages.append(Page(entry))

    def _scan_languages(self, path):
        """Return (language, filename) tuples for files matching
        '.<path>.<lang>.xml'."""
        found = []
        for candidate in glob.glob('.' + path + '*'):
            parts = candidate.split('.')
            if len(parts) > 3 and parts[3] == 'xml':
                found.append((parts[2], candidate))
        return found

    def link(self):
        return self._link
187
class Node():
    """A single trie node: a path token, its payload value, and children."""
    def __init__(self,token,value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        return self._token

    def value(self):
        return self._value

    def children(self):
        return self._children

class Trie():
    """Trie keyed on path tokens; iteration yields payloads in pre-order."""
    def __init__(self):
        self._root = []

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self,t):
        """Pre-order generator over the values stored under node list *t*.

        The previous version called self.inorder() recursively without
        yielding from it, so nested values were silently skipped.
        """
        for node in t:
            yield node.value()
            for v in self.inorder(node.children()):
                yield v

    def _add(self,trie, key, content):
        # Consume the leading token; append a leaf when it is the last one,
        # otherwise descend into the matching child.  NOTE: mutates *key*.
        k = key.pop(0)
        if key == []:
            trie.append(Node(k,content))
        else:
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)

    def add(self,key, content):
        """Insert *content* under the token path *key* (a list of tokens)."""
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        # Add each node and its edges, then recurse once per node (the old
        # code recursed once per *child*, re-walking subtrees redundantly).
        for node in trie:
            G.add_node(node.token())
            for ch in node.children():
                G.add_edge(node.token(), ch.token())
            self._graph(node.children(), G)

    def graph(self):
        """Build a pygraphviz digraph of the site rooted at 'sitemap'."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
246
247 class Sitemap():
248     """Class keeping the internal site structure"""
249     def __init__(self):
250         self._file = 'sitemap.txt'
251         self._tree = Trie()
252
253     def add_link(self, link):
254         tokens = filter(None,re.split(r'(^/\w*/|\w*/)',link))
255         self._tree.add(tokens,Link(link))
256
257     def read_map(self):
258         try:
259             f = open(self._file)
260             sml = f.read().split()
261             f.close()
262             for line in sml:
263                 self.add_link(line)
264         except IOError, what_error:
265             print 'INFO: Could not read sitemap.txt - one will be created'
266
267     def set(self):
268         return set(link.link() for link in self._tree)
269
270     def graph(self):
271         self._tree.graph()
272
273     def gen_menu(self,lang,page):
274         return 'Generate menu from sitemap - To be implemented'
275
def generateSitemap():
    """Scan the tree for articles, merge with sitemap.txt, rewrite it and
    return the sitemap as a list of page dicts.

    Each dict carries: link, title, menu, output (html path), exe (1 if any
    executable xinclude), file (source xml path), res (resource files) and
    level (path depth).  Articles lacking a title or titleabbrev are skipped.
    """
    sitemap = []
    # Seed with the existing sitemap.txt so ordering is preserved.
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError, what_error:
        print 'Sitemap missing - generating one.'

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile,
                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
                                              u'xi': u'http://www.w3.org/2001/XInclude',
                                              u'xl': u'http://www.w3.org/1999/xlink'})
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                # Flag pages that include executable scripts.
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                if title and menu:
                    found = 0
                    # './foo/index.en.xml' -> '/foo/index' -> '/foo/'
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    # Depth = number of path tokens in the link.
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    # Collect linked files and images that exist on disk.
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    filename.replace('xml','html')),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    # Merge into an existing entry, or append a new one.
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print "adding "+link+" to sitemap"
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    # Persist the (possibly extended) link list for the next run.
    sfile = open('sitemap.txt','w')
    for l in sitemap:
        sfile.write(l['link']+'\n')
    sfile.close()
    return sitemap
342
def expandXincludeTxt(page):
    """Parse page['file'] and splice executable xi:include[parse='text']
    nodes with the XML their referenced script prints on stdout.

    Only hrefs whose extension is in valid_scripts are executed; the
    include node itself is removed after its replacement children are
    inserted.  Returns the amara document.
    """
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        code  = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                # NOTE(review): the href is resolved against the current
                # working directory, not the article's directory -- confirm
                # the caller's cwd is correct for nested articles.
                exe = os.path.join(os.path.abspath(c.href))
                xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                xstr = bindery.parse(str(xml.stdout.read()))
                # Insert each generated child at the include's position,
                # then drop the include node itself.
                id = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(id,x)
                c.xml_parent.xml_remove(c)
    return doc
360
361 def xsltConvert(doc):
362 #  amara can not handle the docbook stylesheets
363 #  xmlarticle = transform(doc,style_xslt)
364     cwd = os.getcwd()
365     rundir = os.path.dirname(page['file'])
366     os.chdir(rundir)
367     infile  = os.path.basename(tempfile.mktemp())
368     outfile = tempfile.mktemp()
369     tfi = open(infile,'w')
370     tfi.write(doc.xml_encode())
371     tfi.close()
372 #  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
373     cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
374     retcode = subprocess.call(cmd)
375     if retcode:
376         print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
377     tfo = open(outfile,'r')
378     result = tfo.read()
379     tfo.close()
380     os.remove(infile)
381     os.remove(outfile)
382     os.chdir(cwd)
383     return result
384
def genMenu(page,sitemap,slevel,elevel):
    """Build a nested HTML <ul> menu from the flat *sitemap* list.

    page:   current sitemap entry dict, or None for the site-wide menu.
    sitemap: list of dicts with 'link', 'menu' and 'level' keys.
    slevel, elevel: inclusive range of levels to render.
    Returns (html, title); title may be None when no heading was found.
    """
    title = None
    sm = []
    if elevel == MAXLEVEL or elevel == 1 or page == None:
        # Full site menu: consider every entry (filtered by level below).
        html = '<ul>\n'
        sm = sitemap
    else:
        # Single-level menu: scan backwards from the current page for the
        # section heading, then collect the run of same-level siblings.
        html = '<ul class="tree">\n'
        idx = sitemap.index(page)
        while (sitemap[idx]['level'] == page['level']):
            idx = idx-1
        title = sitemap[idx]['menu']
        idx = idx+1
        while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
            sm.append(sitemap[idx])
            idx = idx+1
    oldlevel = slevel

    for p in sm:
        if slevel > p['level'] or elevel < p['level']:
            continue
        if not title and p['link'] == '/':
            title = p['menu']

        # Open or close nested lists when the depth changes between entries.
        if oldlevel < p['level']:
            html+='<ul>\n'
        elif oldlevel > p['level']:
            if p['link'][-1] == '/':
                html+='</li>\n'
            html+='</ul>\n</li>\n'
        if page != None and page == p:
            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
        else:
            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
        # Directory links ('.../') stay open to contain their children;
        # the root '/' is closed immediately.
        if p['link'][-1] != '/' or p['link'] == '/':
            html+='</li>\n'
        oldlevel = p['level']
    html+='</ul>\n'
    return (html,title)
424
def writeToTemplate(page,doc,sitemap):
    """Render *page* through the Cheetah template into the staging area and
    copy its resource files alongside it."""
    (menu, menuname) = genMenu(page, sitemap, 1, MAXLEVEL)
    (levelmenu, levelname) = genMenu(page, sitemap, page['level'], page['level'])
    context = [{'title': page['title']},
               {'menu': menu},
               {'article': doc},
               {'levelmenu': levelmenu},
               {'levelname': levelname}]
    template = Template(file=style_tmpl, searchList=context)
    outfile = tmptarget + page['output']
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
    for r in page['res']:
        mkdir_p(os.path.dirname(tmptarget + r))
        shutil.copyfile(r, tmptarget + r)
442
def createSitemap(sitemap):
    """Render the human-readable sitemap page into the staging area."""
    (menu, menuname) = genMenu(None, sitemap, 1, MAXLEVEL)
    context = [{'title': 'Sitemap'},
               {'menu': menu},
               {'article': menu},
               {'levelmenu': ''},
               {'levelname': ''}]
    template = Template(file=style_tmpl, searchList=context)
    outfile = tmptarget + 'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
457
458 dir_ = Directory()
459 sitemap = Sitemap()
460
461 dir_.scan()
462 sitemap.read_map()
463
464 missing = dir_.set() - sitemap.set()
465 removed = sitemap.set() - dir_.set()
466 for page in removed:
467     print removed+' pages missing!!'
468
469 for page in missing:
470     print 'adding missing page '+page
471     sitemap.add_page(page)
472
473 sitemap.graph()
474
475
476 sitemap = generateSitemap()
477 tmptarget = tempfile.mkdtemp()+'/'
478 for page in sitemap:
479     t1 = time.time()
480     print "Page : %-30s %30s" % (page['link'],
481                         time.ctime(os.stat(page['file']).st_mtime)),
482     doc = expandXincludeTxt(page)
483     pubdoc = xsltConvert(doc)
484     writeToTemplate(page,pubdoc,sitemap)
485     t2 = time.time()
486     print "[%5.2f s]" % (round(t2-t1,2))
487
488 createSitemap(sitemap)
489 publish(tmptarget, args.output)
490 publish(args.style+"css", args.output)
491 publish(args.style+"images",args.output)