Adding omit_xml_declaration=True to amara call, but does not yet work, considering...
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 import gettext
15 import shutil
16 from amara import bindery
17 from amara.xslt import transform
18 from Cheetah.Template import Template
19
# Command line options: where the style (XSLT, templates, static files)
# lives and where the generated site is published to.
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# The style directory is used by plain string concatenation throughout the
# file (args.style+'docbook.xsl', args.style+'index.<lang>.html.tmpl', ...),
# so make sure it ends with exactly one path separator even when the user
# passes "--style /path/to/style" without the trailing slash.
args.style = os.path.join(args.style, '')

style_xslt = args.style+"docbook.xsl"
outputdir = args.output

# Scratch directory the site is assembled in before it is published.
tmptarget = tempfile.mkdtemp()+'/'

# Extensions of xi:include targets that are executed instead of included.
valid_scripts = ['.py','.pl']
# NOTE(review): MAXLEVEL is not referenced anywhere in the visible code.
MAXLEVEL = 10000
34
def mkdir_p(path):
    """Create directory `path` and any missing parents (like ``mkdir -p``).

    An already existing directory is not an error.  Any other failure is
    re-raised as OSError, including the case where `path` exists but is
    not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # EEXIST is also reported when a regular file is in the way; only
        # swallow it when the directory really is there.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
42
def publish(src,target):
    """Mirror `src` to `target` with ``rsync -a --delete``.

    A non-zero rsync exit status is reported on stdout; nothing is raised
    and nothing is returned.
    """
    cmd = ["rsync", "-a", "--delete"]
    cmd.extend([src, target])
    status = subprocess.call(cmd)
    if status != 0:
        print('Error: %s Returncode [%s]' % (' '.join(cmd), status))
48
def ssh_cmd(target, command):
    """Run `command` with the path part of `target` appended as last argument.

    target  -- rsync style destination: either "host:path", in which case
               the command is run on `host` through ssh, or a plain local
               path, in which case the command is run locally.
    command -- whitespace separated command line, e.g. "mkdir -p".

    A non-zero exit status is reported on stdout; nothing is raised for it.
    """
    host, sep, path = target.partition(":")
    argv = command.split()
    # Same rule rsync applies: a colon only separates a host name when no
    # '/' comes before it.  The default --output is a local directory.
    if sep and '/' not in host:
        cmd = ["ssh", host] + argv + [path]
    else:
        cmd = argv + [target]
    retcode = subprocess.call(cmd)
    if retcode:
        print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
56
# Namespace prefix -> namespace URI map passed as `prefixes` to
# bindery.parse, so the XPath expressions in this file can use the
# db:, xi:, xl: and html: prefixes.
PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink',
          u'html' : u'http://www.w3.org/1999/xhtml'}
61
class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        # Root of the scan; every link is expressed relative to it.
        self._cwd = '.'
        # Links found by scan(), one entry per matching file.
        self._tree = []

    def _link_for(self, file_):
        """Return the site link for article file `file_`.

        The link is the path below the scan root with everything from the
        first '.' of the file name removed; a final 'index' component is
        dropped so that './foo/index.en.xml' becomes '/foo/'.
        """
        dirpart, name = os.path.split(file_)
        stem = name.split('.', 1)[0]
        # Only the file name is cut at the first dot, so directory names
        # containing dots survive intact.
        base = os.path.join(dirpart, stem)[len(self._cwd):]
        # Strip 'index' only when it is the whole last component; a plain
        # replace() would also mangle paths such as '/indexes/page'.
        if base.endswith('/index'):
            base = base[:-len('index')]
        return base

    def scan(self):
        """Collect the links of all articles below the scan root.

        A *.xml file counts as an article when it has both
        /db:article/db:info/db:title and /db:article/db:info/db:titleabbrev.
        """
        for dirname, dirnames, filenames in os.walk(self._cwd):
            for filename in fnmatch.filter(filenames, '*.xml'):
                file_ = os.path.join(dirname,filename)
                doc = bindery.parse(file_, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if title and menu:
                    self._tree.append(self._link_for(file_))

    def set(self):
        """Return the links found so far as a set (language versions merged)."""
        return set(self._tree)
83
class Page():
    """Class representing a version of a webpage"""
    def __init__(self,link,page):
        """`link` is the owning Link, `page` a (language, filename) tuple."""
        self._link = link
        self._file = page[1]
        self._lang = page[0]
        self._doc = None
        self._resources = []
        self._title = None
        self._menu = None
        self._rendered_article = None

    def language(self):
        """Return the language code of this page."""
        return self._lang

    def resources(self):
        """Return the set of local files this page refers to."""
        return set(self._resources)

    def menu(self):
        """Return the menu text (titleabbrev), or None before prepare()."""
        return self._menu

    def set_article(self,art):
        """Use `art` as the rendered article instead of calling render()."""
        self._rendered_article = art

    def _run_scripts(self, dirname):
        """Replace xi:include elements pointing at scripts by their output."""
        code  = self._doc.xml_select(u"//xi:include[@parse='text']")
        if not code:
            return
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext not in valid_scripts:
                continue
            exe = os.path.abspath(dirname)+'/'+c.href
            proc = subprocess.Popen([exe],stdout=subprocess.PIPE)
            # communicate() reads all output and waits for the child, so
            # no zombie process is left behind.
            output = proc.communicate()[0]
            if proc.returncode:
                print('Error: '+exe+' Returncode ['+str(proc.returncode)+']')
            xstr = bindery.parse(str(output))
            parent = c.xml_parent
            idp = c.xml_index_on_parent
            # Copy the child list first, then insert the nodes in document
            # order in front of the include element and remove the include
            # exactly once afterwards.
            children = list(xstr.xml_children)
            for offset, x in enumerate(children):
                parent.xml_insert(idp+offset,x)
            if children:
                parent.xml_remove(c)

    def _collect_resources(self, dirname):
        """Remember the existing local files the document refers to."""
        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)
        for i in self._doc.xml_select(u"//html:form[@action]"):
            # Keep the action up to the first '.py' (drops what follows it).
            pyscript = re.split(r'\.py',i.action,1)[0]+'.py'
            im = os.path.join(dirname,pyscript)
            if os.path.isfile(im):
                self._resources.append(im)

    def prepare(self):
        """Parse the article, read its titles, run scripts, find resources."""
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        self._run_scripts(dirname)
        self._collect_resources(dirname)

    def render(self):
        """Transform the prepared document with xsltproc and style_xslt.

        The result is stored as the rendered article.  A non-zero exit
        status of xsltproc is reported on stdout.  The working directory
        is restored and the temporary files are removed even on failure.
        """
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        os.chdir(dirname)
        infile = None
        outfile = None
        try:
            # The input copy is created next to the article so relative
            # references are resolved from the article directory.
            # mkstemp creates the files itself, unlike the race-prone mktemp.
            in_fd, infile = tempfile.mkstemp(dir=os.curdir)
            with os.fdopen(in_fd,'w') as tfi:
                tfi.write(self._doc.xml_encode(omit_xml_declaration=True))
            out_fd, outfile = tempfile.mkstemp()
            os.close(out_fd)
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
            cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
            retcode = subprocess.call(cmd)
            if retcode:
                print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
            with open(outfile,'r') as tfo:
                self._rendered_article = tfo.read()
        finally:
            for leftover in (infile, outfile):
                if leftover is not None and os.path.exists(leftover):
                    os.remove(leftover)
            os.chdir(cwd)

    def template(self,sitemap):
        """Fill the language specific HTML template and write the page.

        The file is written below tmptarget, at the path of the source
        file with its last 'xml' replaced by 'html'.
        """
        htmlmenu =  sitemap.gen_menu(self._lang,None,"menu")
        levelmenu = sitemap.gen_menu(self._lang,self,"tree")
        langmenu = sitemap.lang_menu(self._lang,self._link)
        template = Template(file=args.style+'index.'+self._lang+'.html.tmpl',
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'langmenu':langmenu}])
        outfile = tmptarget+'html'.join(self._file.rsplit('xml',1))
        mkdir_p(os.path.dirname(outfile))
        with open(outfile, 'w') as out:
            out.write(str(template))
180         out.close()
181
182
class Link():
    """Class representing a webpage on the site"""
    def __init__(self,link):
        """Create the link and one Page per language version found on disk."""
        self._link = link
        # find the representations of the link.
        self._pages = []
        path = link
        # A link ending in '/' is served by the index page of the directory.
        if link.endswith('/'):
            path = path+'index'
        for l in self._scan_languages(path):
            self._pages.append(Page(self,l))

    def add_page(self,l):
        """Add a page given as a (language, filename) tuple."""
        self._pages.append(Page(self,l))

    def _scan_languages(self,path):
        """Return (language, filename) tuples for the files of `path`.

        Only files named exactly '.<path>.<language>.xml' are accepted, so
        pages that merely share a name prefix ('/about' vs 'about-us') and
        backup copies ('about.en.xml.bak') are not picked up.
        """
        lang = []
        prefix = '.'+path+'.'
        suffix = '.xml'
        for l in glob.glob(prefix+'*'+suffix):
            code = l[len(prefix):-len(suffix)]
            # '*' also matches dots; a language code has none.
            if code and '.' not in code:
                lang.append((code,l))
        return lang

    def link(self):
        """Return the link path."""
        return self._link

    def prepare(self):
        """Prepare every language version."""
        for page in self._pages:
            page.prepare()

    def languages(self):
        """Return the language codes of all pages, in page order."""
        return [page.language() for page in self._pages]

    def render(self):
        """Render every language version."""
        for page in self._pages:
            page.render()

    def template(self,sitemap):
        """Write the templated output of every language version."""
        for page in self._pages:
            page.template(sitemap)

    def page(self,lang):
        """Return the first page in language `lang`, or None."""
        for page in self._pages:
            if page.language()==lang:
                return page
        return None

    def resources(self):
        """Return the union of the resources of all pages."""
        res  = set()
        for page in self._pages:
            res = res.union(page.resources())
        return res
239
240
class Node():
    """One entry of the Trie: a key token, its payload and its sub-entries."""
    def __init__(self,token,value):
        self._children = []
        self._value = value
        self._token = token

    def children(self):
        """Return the (mutable) list of child nodes."""
        return self._children

    def value(self):
        """Return the payload stored in this node."""
        return self._value

    def token(self):
        """Return the key token this node was stored under."""
        return self._token
255
class Trie():
    """Tree of Nodes keyed by a list of path tokens; holds the site structure."""
    def __init__(self):
        self._root = []

    def __iter__(self):
        """Iterate over all stored values, parents before their children."""
        return self.inorder(self._root)

    def inorder(self,t):
        """Yield the values of node list `t` and, recursively, their children."""
        for l in t:
            yield l.value()
            for x in self.inorder(l.children()):
                yield x

    def _add(self,trie, key, content):
        """Store `content` under `key` below the node list `trie`.

        The last token of `key` becomes a new node; every token before it
        must match an existing node.  `key` itself is not modified.
        """
        k, rest = key[0], key[1:]
        # is the key a leaf
        if not rest:
            trie.append(Node(k,content))
            return
        found = False
        for ch in trie:
            if ch.token() == k:
                found = True
                self._add(ch.children(), rest, content)
        if not found:
            # Without the parent there is nowhere to hang the entry; say so
            # instead of dropping it silently.
            print("WARNING: no entry '%s' to add '%s' below - skipped"
                  % (k, ''.join(rest)))

    def add(self,key, content):
        """Store `content` under the token list `key`."""
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        """Add the nodes of `trie` and the edges to their children to G."""
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            # Descend once per node, not once per child edge.
            self._graph(l.children(), G)

    def graph(self):
        """Build and return a pygraphviz graph of the trie, rooted at "sitemap"."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
#        print G.string()
        return G

    def _menu(self, trie, lang, page, css):
        """Return the nested <ul> menu for node list `trie` in language `lang`.

        The entry whose page is `page` gets class="selected".  Entries
        without a page in `lang` link to the English page, marked with '*'.
        """
        parts = ["<ul%s>\n" % css]
        for l in trie:
            target = l.value()
            p = target.page(lang)
            # A missing translation (p is None) must not count as selected
            # when no page is selected at all (page is None).
            if p is not None and p == page:
                sel = ' class="selected"'
            else:
                sel = ''
            if p is not None:
                parts.append('<li%s><a href="%s">%s</a>\n'
                             % (sel,target.link(),p.menu()))
            else:
                parts.append('<li%s><a href="%s.en" hreflang="en">%s</a>*\n'
                             % (sel,target.link(), target.page('en').menu()))
            if l.children():
                parts.append(self._menu(l.children(), lang, page, ""))
        parts.append("</ul>\n")
        return ''.join(parts)

    def menu(self,lang,page,cssclass):
        """Return the HTML menu; `cssclass`, if non-empty, is set on the outer <ul>."""
        css = ''
        if cssclass:
            css = ' class="'+cssclass+'"'
        return self._menu(self._root, lang, page, css)
323
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()
        # Language codes used by any page of the site.
        self._sitelang = set()
        # ISO 639-3 table, used to look up language names.
        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
        # gettext catalogs for language names, keyed by language code.
        self._tranlang = {}

    def add_link(self, link):
        """Split `link` into path tokens and add it to the tree."""
        tokens = [t for t in re.split(r'(^/[\w-]*/|[\w-]*/)',link) if t]
        self._tree.add(tokens,Link(link))

    def write_map(self):
        """Write all links, one per line, to the sitemap file."""
        with open(self._file,'w') as f:
            f.write('\n'.join(link.link() for link in self._tree))

    def read_map(self):
        """Load the links listed in the sitemap file, if there is one."""
        try:
            with open(self._file) as f:
                sml = f.read().split()
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')
            return
        # Kept outside the try so only a failure to read the file is
        # reported as a missing sitemap.
        for line in sml:
            self.add_link(line)

    def set(self):
        """Return the set of all links in the tree."""
        return set(link.link() for link in self._tree)

    def process(self):
        """Prepare, render and template all pages into tmptarget.

        Also copies the resources the pages refer to and generates a
        sitemap page per site language.  The time of each phase is printed.
        """
        t1 = time.time()
        for link in self._tree:
            link.prepare()
        t2 = time.time()
        print("Prepare  [%5.2f s]" % (round(t2-t1,2)))
        for link in self._tree:
            self._sitelang = self._sitelang.union(set(link.languages()))
        for tran in self._sitelang:
            if tran != 'en':
                self._tranlang[tran] = gettext.translation('iso_639_3',
                                                           languages=[tran])
        t3 = time.time()
        print("Language [%5.2f s]" % (round(t3-t2,2)))
        for link in self._tree:
            link.render()
        t4 = time.time()
        print("Render   [%5.2f s]" % (round(t4-t3,2)))
        for link in self._tree:
            link.template(self)
        t5 = time.time()
        print("Template [%5.2f s]" % (round(t5-t4,2)))
        res = set()
        for link in self._tree:
            res = res.union(link.resources())
        for f in res:
            outfile = tmptarget+f
            mkdir_p(os.path.dirname(outfile))
            shutil.copyfile(f,outfile)
        # Taken after the copy so the copy is timed as "Resources" and not
        # billed to the sitemap phase.
        t6 = time.time()
        print("Resources[%5.2f s]" % (round(t6-t5,2)))
        sitemaplink = Link('/sitemap')
        for l in self._sitelang:
            sitemaplink.add_page((l,'/sitemap.'+l+'.xml'))
        for l in self._sitelang:
            sitemaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap"))
            sitemaplink.page(l).template(self)
        t7 = time.time()
        print("Sitemap  [%5.2f s]" % (round(t7-t6,2)))

    def graph(self):
        """Build the graph of the site tree."""
        self._tree.graph()

    def gen_menu(self,lang,page,cssclass):
        """Return the HTML menu of the site tree in language `lang`."""
        return self._tree.menu(lang,page,cssclass)

    def lang_menu(self,lang,link):
        """Return an HTML list linking to every language version of `link`.

        Language names come from the ISO table and are translated into
        `lang` unless it is 'en'.
        """
        items = []
        for l in link.languages():
            isoxml = u"//iso_639_3_entry[@*='"+l+"']"
            entries = self._isocode.xml_select(isoxml)
            # Fall back to the bare code for a language the ISO table does
            # not list instead of failing with an IndexError.
            if entries:
                ln = entries[0].name
            else:
                ln = l
            if lang != 'en':
                ln = self._tranlang[lang].gettext(ln)
            p = link.link()
            if p[-1] == '/':
                p = p +'index'
            p = p+'.'+l
            items.append('<li><a href="%s" hreflang="%s">%s</a></li>' % (p, l, ln))
        return "<ul>" + ''.join(items) + "</ul>"

    def publish(self):
        """Copy the generated site and the style's static files to args.output."""
        ssh_cmd(args.output,"mkdir -p")
        publish(tmptarget, args.output)
        for res in ["css","images","js","favicon.ico"]:
            if (os.path.exists(args.style+res)):
                publish(args.style+res, args.output)
        ssh_cmd(args.output,"chmod a+rx")
424
# Script body: reconcile the article tree with sitemap.txt, build the
# site in the scratch directory and publish it.
ts = time.time()
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

# Articles on disk that the sitemap lacks, and sitemap entries whose
# article is gone.
on_disk = dir_.set()
in_map = sitemap.set()
missing = on_disk - in_map
removed = in_map - on_disk
for page in removed:
    print(page+' pages missing!!')
for page in missing:
    print('adding missing page '+page)
    sitemap.add_link(page)
if missing or removed:
    print('writing new sitemap - please adjust if needed')
    sitemap.write_map()
sitemap.graph()

sitemap.process()

# Publishing is timed separately from the build.
t1 = time.time()
sitemap.publish()
t2 = time.time()
print("Publish  [%5.2f s]" % (round(t2-t1,2)))
print("Total    [%5.2f s]" % (round(t2-ts,2)))