6f03ff77307d38894fa5fce72bec2b2f285302ef
[treecutter.git] / treecutter / trie.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 import glob
14 import gettext
15 import shutil
16 from amara import bindery
17 from amara.xslt import transform
18 from Cheetah.Template import Template
19
# Command line interface.
#   --style   directory holding docbook.xsl, the html templates and static assets
#   --output  publish target handed to rsync
# Both defaults are located next to the parent of the current working directory.
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument(
    '--style',
    nargs='?',
    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument(
    '--output',
    nargs='?',
    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# Stylesheet applied to every article when rendering.
style_xslt = args.style+"docbook.xsl"
outputdir = args.output

# Staging directory; the generated site is assembled here before publishing.
tmptarget = tempfile.mkdtemp()+'/'

# Extensions of xi:include targets that are executed instead of included.
valid_scripts = ['.py','.pl']
MAXLEVEL = 10000
34
def mkdir_p(path):
    """Create path and any missing parents, like ``mkdir -p``.

    An OSError whose errno is EEXIST is ignored; every other OSError is
    re-raised to the caller.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        # Only "already exists" is acceptable; anything else is a real failure.
        if err.errno != errno.EEXIST:
            raise
42
def publish(src,target):
    """Mirror src onto target by running ``rsync -a --delete``.

    A non-zero rsync exit status is reported on stdout; no exception is
    raised for it.
    """
    cmd = ["rsync", "-a", "--delete"]
    cmd.extend([src, target])
    status = subprocess.call(cmd)
    if status != 0:
        print('Error: '+' '.join(cmd)+' Returncode ['+str(status)+']')
48
def ssh_cmd(target, command):
    """Run command with the path part of target appended as last argument.

    target  -- rsync style destination, either ``host:path`` or a plain
               local path (the default value of --output is a local path).
    command -- whitespace separated command line, e.g. ``"mkdir -p"``.

    For ``host:path`` the command is executed on host through ssh.  A target
    without a host part is handled by running the command locally; indexing
    the split result unconditionally raised IndexError for such targets.
    A non-zero exit status is reported on stdout, no exception is raised.
    """
    host, sep, path = target.partition(":")
    argv = command.split()
    if sep:
        cmd = ["ssh", host] + argv + [path]
    else:
        # No "host:" prefix - the target lives on this machine.
        cmd = argv + [target]
    retcode = subprocess.call(cmd)
    if retcode:
        print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
56
# Namespace prefixes made available to the XPath expressions in this file.
PREFIXES = {
    u'db': u'http://docbook.org/ns/docbook',
    u'xi': u'http://www.w3.org/2001/XInclude',
    u'xl': u'http://www.w3.org/1999/xlink',
    u'html': u'http://www.w3.org/1999/xhtml',
}
61
class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        self._cwd = '.'      # root of the walk performed by scan()
        self._tree = []      # links collected by scan(), duplicates possible

    @staticmethod
    def _link_for(file_):
        """Return the site link for an article file found by scan().

        file_ is a path such as ``./docs/index.en.xml``; the link is the
        text between the leading ``.`` and the next ``.`` of that path.  A
        trailing ``index`` component is dropped so that the link names the
        directory (``/docs/``).  Only that final component is touched:
        blindly replacing every "index" substring turned ``/reindexing/index``
        into ``/reing/`` and ``/indexes`` into ``/es``.
        """
        base = file_.split('.')[1]
        if base.endswith('/index'):
            return base[:-len('index')]
        return base

    def scan(self):
        """Walk the tree below the current directory and record article links.

        Every ``*.xml`` file is parsed; it counts as an article when it has
        both a db:title and a db:titleabbrev inside /db:article/db:info.
        """
        for dirname, dirnames, filenames in os.walk(self._cwd):
            for filename in filenames:
                if not fnmatch.fnmatch(filename, '*.xml'):
                    continue
                file_ = os.path.join(dirname,filename)
                doc = bindery.parse(file_, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if title and menu:
                    self._tree.append(self._link_for(file_))

    def set(self):
        """Return the collected links as a set (one entry per link)."""
        return set(self._tree)
83
class Page():
    """Class representing a version of a webpage"""
    def __init__(self,link,page):
        """link is the owning Link, page a (language, filename) tuple."""
        self._link = link
        self._file = page[1]
        self._lang = page[0]
        self._doc = None               # parsed document, set by prepare()
        self._resources = []           # files referenced by the article
        self._title = None
        self._menu = None
        self._rendered_article = None  # html produced by render()/set_article()

    def language(self):
        """Return the language code of this page."""
        return self._lang

    def resources(self):
        """Return the set of resource files collected by prepare()."""
        return set(self._resources)

    def menu(self):
        """Return the menu label (db:titleabbrev), None before prepare()."""
        return self._menu

    def set_article(self,art):
        """Use art as the rendered article instead of calling render()."""
        self._rendered_article = art

    def prepare(self):
        """Parse the article, run script includes and collect resources."""
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        self._run_script_includes(dirname)
        self._collect_resources(dirname)

    def _run_script_includes(self, dirname):
        """Replace xi:include elements naming a script by the script's output.

        Only includes with parse='text' whose href extension is listed in
        valid_scripts are executed; the script's stdout is parsed as XML.
        """
        for c in self._doc.xml_select(u"//xi:include[@parse='text']"):
            ext = os.path.splitext(c.href)[1]
            if ext not in valid_scripts:
                continue
            exe = [os.path.abspath(dirname)+'/'+c.href]
            # Test the attributes of *this* include.  An absolute
            # "//xi:include[...]" expression looks at the whole document and
            # is true as soon as any include carries the attribute.
            if c.xml_select(u"@accept-language"):
                alang = c.xml_attributes[None, "accept-language"]
                exe.append("lang="+alang)
            if c.xml_select(u"@xpointer"):
                exe.append("xptr="+c.xpointer)
            proc = subprocess.Popen(exe,stdout=subprocess.PIPE)
            # communicate() reads stdout and waits for the child to exit.
            output = proc.communicate()[0]
            xstr = bindery.parse(str(output))
            parent = c.xml_parent
            idp = c.xml_index_on_parent
            # Insert the generated nodes in document order, then drop the
            # include element exactly once.
            for offset, x in enumerate(list(xstr.xml_children)):
                parent.xml_insert(idp+offset,x)
            parent.xml_remove(c)

    def _collect_resources(self, dirname):
        """Record existing files referenced by links, images and form actions."""
        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)
        for i in self._doc.xml_select(u"//html:form[@action]"):
            # Keep the action up to and including the first ".py".
            pyscript = re.split(r'\.py',i.action,1)[0]+'.py'
            im = os.path.join(dirname,pyscript)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        """Transform the prepared document with xsltproc and keep the result.

        xsltproc runs with the article's directory as working directory and
        reads a temporary copy of the document written there.  The working
        directory is restored and the temporary files are removed even when
        a step fails.  A non-zero xsltproc status is reported on stdout; if
        no output file was produced the following open() raises IOError.
        """
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        # Private directory for the output: the name cannot be claimed by
        # another process, unlike a name obtained from tempfile.mktemp().
        outdir = tempfile.mkdtemp()
        outfile = os.path.join(outdir,'article.html')
        infile = None
        try:
            os.chdir(os.path.dirname(self._file))
            fd, infile = tempfile.mkstemp(dir=os.curdir)
            with os.fdopen(fd,'w') as tfi:
                tfi.write(self._doc.xml_encode(omit_xml_declaration=True))
            cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
            retcode = subprocess.call(cmd)
            if retcode:
                print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
            with open(outfile,'r') as tfo:
                self._rendered_article = tfo.read()
        finally:
            # infile may be relative to the article directory, so remove it
            # before changing back.
            if infile is not None and os.path.exists(infile):
                os.remove(infile)
            os.chdir(cwd)
            shutil.rmtree(outdir, ignore_errors=True)

    def template(self,sitemap):
        """Fill the language specific html template and write it to tmptarget.

        The output name is the article file name with its last 'xml'
        replaced by 'html', placed below tmptarget.
        """
        htmlmenu =  sitemap.gen_menu(self._lang,None,"menu")
        levelmenu = sitemap.gen_menu(self._lang,self,"tree")
        langmenu = sitemap.lang_menu(self._lang,self._link)
        template = Template(file=args.style+'index.'+self._lang+'.html.tmpl',
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'langmenu':langmenu}])
        outfile = tmptarget+'html'.join(self._file.rsplit('xml',1))
        mkdir_p(os.path.dirname(outfile))
        with open(outfile, 'w') as out:
            out.write(str(template))
187
188
class Link():
    """Class representing a webpage on the site"""
    def __init__(self,link):
        """Create the link and one Page per language version found on disk.

        A link ending in '/' refers to the 'index' article of that directory.
        """
        self._link = link
        # find the representations of the link.
        self._pages = []
        path = link
        if path.endswith('/'):
            path = path+'index'
        for l in self._scan_languages(path):
            self._pages.append(Page(self,l))

    def add_page(self,l):
        """Add a page given as a (language, filename) tuple."""
        self._pages.append(Page(self,l))

    def _scan_languages(self,path):
        """Return sorted (language, filename) tuples for the files of path.

        Only files named exactly ``.<path>.<language>.xml`` (relative to the
        current directory) are accepted.  A bare prefix glob also returned
        the files of sibling pages sharing the prefix (``/foo`` matched
        ``foobar.en.xml``) and files with trailing junk (``foo.en.xml.bak``).
        """
        prefix = '.'+path+'.'
        suffix = '.xml'
        lang = []
        for l in sorted(glob.glob(prefix+'*'+suffix)):
            code = l[len(prefix):-len(suffix)]
            # The language part is a single dot-free component.
            if code and '.' not in code:
                lang.append((code,l))
        return lang

    def link(self):
        """Return the link string this object was created with."""
        return self._link

    def prepare(self):
        """Prepare every language version of the page."""
        for page in self._pages:
            page.prepare()

    def languages(self):
        """Return the list of language codes, one per page."""
        return [page.language() for page in self._pages]

    def render(self):
        """Render every language version of the page."""
        for page in self._pages:
            page.render()

    def template(self,sitemap):
        """Write the html output of every language version."""
        for page in self._pages:
            page.template(sitemap)

    def page(self,lang):
        """Return the first page whose language equals lang, else None."""
        for page in self._pages:
            if page.language()==lang:
                return page
        return None

    def resources(self):
        """Return the union of the resource sets of all pages."""
        res  = set()
        for page in self._pages:
            res = res.union(page.resources())
        return res
245
246
class Node():
    """One entry of the Trie: a key token, the stored value and child nodes."""
    def __init__(self,token,value):
        self._children = []   # child Node objects, in insertion order
        self._value = value
        self._token = token

    def token(self):
        """Return the key token this node was created with."""
        return self._token

    def value(self):
        """Return the value stored in this node."""
        return self._value

    def children(self):
        """Return the (mutable) list of child nodes."""
        return self._children
261
class Trie():
    """Tree of Node objects addressed by lists of key tokens."""
    def __init__(self):
        self._root = []   # top level nodes, in insertion order

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self,t):
        """Yield the value of every node below t, parents before children."""
        for l in t:
            yield l.value()
            for x in self.inorder(l.children()):
                yield x

    def _add(self,trie, key, content):
        """Insert content below trie; key must be a non-empty list of tokens.

        The last token names the new leaf, the tokens before it select the
        existing nodes to descend into.  When no node matches a leading
        token nothing is inserted.
        """
        k = key[0]
        rest = key[1:]
        # is the key a leaf
        if not rest:
            trie.append(Node(k,content))
            return
        for ch in trie:
            if ch.token() == k:
                self._add(ch.children(), rest, content)

    def add(self,key, content):
        """Store content under key (a sequence of tokens).

        key is copied first, so the caller's sequence is left unmodified.
        """
        self._add(self._root, list(key), content)

    def _graph(self, trie, G):
        """Add the nodes of trie and their parent/child edges to G."""
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            # Descend once per node; doing this inside the loop above repeats
            # the whole subtree walk for every child.
            self._graph(l.children(), G)

    def graph(self):
        """Build and return a directed graph of the trie rooted at "sitemap"."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
#        print G.string()
        return G

    def _menu(self, trie, lang, page, css):
        """Return a nested <ul> for trie in language lang.

        The entry whose page is `page` gets class="selected".  Entries
        without a page in lang link to their english page instead and are
        marked with '*'; an AttributeError is raised when that page is
        missing as well.
        """
        parts = ["<ul%s>\n" % css]
        for l in trie:
            link = l.value()
            p = link.page(lang)
            if p is not None:
                # Compare only real pages: with page=None (menu without a
                # current page) a missing translation must not be selected.
                sel = ''
                if p == page:
                    sel = ' class="selected"'
                parts.append('<li%s><a href="%s">%s</a>\n'
                             % (sel,link.link(),p.menu()))
            else:
                href = link.link()
                # Directory links address their index article, as the
                # language menu of the sitemap does.
                if href.endswith('/'):
                    href = href+'index'
                parts.append('<li><a href="%s.en" hreflang="en">%s</a>*\n'
                             % (href, link.page('en').menu()))
            if l.children():
                parts.append(self._menu(l.children(), lang, page, ""))
        parts.append("</ul>\n")
        return "".join(parts)

    def menu(self,lang,page,cssclass):
        """Return the html menu of the whole trie.

        cssclass, when non-empty, becomes the class of the outermost <ul>.
        """
        css = ''
        if cssclass:
            css = ' class="'+cssclass+'"'
        return self._menu(self._root, lang, page, css)
329
class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()
        self._sitelang = set()    # language codes used by any page
        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
        self._tranlang = {}       # language code -> gettext translation

    def add_link(self, link):
        """Split link into path tokens and store a Link for it in the tree."""
        tokens = [t for t in re.split(r'(^/[\w-]*/|[\w-]*/)',link) if t]
        self._tree.add(tokens,Link(link))

    def write_map(self):
        """Write the links of the tree to the sitemap file, one per line."""
        with open(self._file,'w') as f:
            f.write('\n'.join(link.link() for link in self._tree))

    def read_map(self):
        """Add every whitespace separated link of the sitemap file.

        An unreadable sitemap file is reported on stdout and ignored.
        """
        try:
            with open(self._file) as f:
                sml = f.read().split()
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')
            return
        # Outside the try block: an error while adding a link is not a
        # problem with reading the sitemap file.
        for line in sml:
            self.add_link(line)

    def set(self):
        """Return the set of link strings currently in the tree."""
        return set(link.link() for link in self._tree)

    def process(self):
        """Prepare, render and template all pages, copy their resources and
        generate the sitemap page, printing the time spent in each phase."""
        t1 = time.time()
        for link in self._tree:
            link.prepare()
        t2 = time.time()
        print("Prepare  [%5.2f s]" % (round(t2-t1,2)))
        for link in self._tree:
            self._sitelang = self._sitelang.union(set(link.languages()))
        for tran in self._sitelang:
            if tran != 'en':
                # fallback=True: without a catalog for the language the
                # names stay untranslated instead of aborting the build.
                self._tranlang[tran] = gettext.translation('iso_639_3',
                                                           languages=[tran],
                                                           fallback=True)
        t3 = time.time()
        print("Language [%5.2f s]" % (round(t3-t2,2)))
        for link in self._tree:
            link.render()
        t4 = time.time()
        print("Render   [%5.2f s]" % (round(t4-t3,2)))
        for link in self._tree:
            link.template(self)
        t5 = time.time()
        print("Template [%5.2f s]" % (round(t5-t4,2)))
        res = set()
        for link in self._tree:
            res = res.union(link.resources())
        for f in res:
            outfile = tmptarget+f
            mkdir_p(os.path.dirname(outfile))
            shutil.copyfile(f,outfile)
        # Taken after the copy so that the resource phase is what is timed.
        t6 = time.time()
        print("Resources[%5.2f s]" % (round(t6-t5,2)))
        sitmaplink = Link('/sitemap')
        for l in self._sitelang:
            sitmaplink.add_page((l,'/sitemap.'+l+'.xml'))
        for l in self._sitelang:
            sitmaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap"))
            sitmaplink.page(l).template(self)
        t7 = time.time()
        print("Sitemap  [%5.2f s]" % (round(t7-t6,2)))

    def graph(self):
        """Build the graph of the site tree."""
        self._tree.graph()

    def gen_menu(self,lang,page,cssclass):
        """Return the html menu of the site tree, see Trie.menu."""
        return self._tree.menu(lang,page,cssclass)

    def lang_menu(self,lang,link):
        """Return an html list linking to every language version of link.

        lang -- language the language names are shown in
        link -- Link object whose languages are listed

        A language code without an entry in the ISO 639-3 list is shown as
        the code itself.
        """
        html = "<ul>"
        for l in link.languages():
            isoxml = u"//iso_639_3_entry[@*='"+l+"']"
            entries = self._isocode.xml_select(isoxml)
            if entries:
                ln = entries[0].name
            else:
                ln = l
            if lang != 'en':
                ln = self._tranlang[lang].gettext(ln)
            p = link.link()
            if p[-1] == '/':
                p = p +'index'
            p = p+'.'+l
            html += '<li><a href="%s" hreflang="%s">%s</a></li>' % (p, l, ln)
        html += "</ul>"
        return html

    def publish(self):
        """Copy the staged site and the style's static files to args.output."""
        ssh_cmd(args.output,"mkdir -p")
        publish(tmptarget, args.output)
        for res in ["css","images","js","favicon.ico"]:
            if (os.path.exists(args.style+res)):
                publish(args.style+res, args.output)
        ssh_cmd(args.output,"chmod a+rx")
430
# Main script: bring the sitemap in line with the articles found on disk,
# then build the site and publish it.
ts = time.time()
dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

missing = dir_.set() - sitemap.set()
removed = sitemap.set() - dir_.set()
for page in sorted(removed):
    print(page+' pages missing!!')
# Sorted order places a parent link ('/docs/') before the links below it
# ('/docs/intro').  The tree only accepts a link whose parent is already
# present, so adding in arbitrary set order could silently drop pages.
for page in sorted(missing):
    print('adding missing page '+page)
    sitemap.add_link(page)
if missing or removed:
    print('writing new sitemap - please adjust if needed')
    sitemap.write_map()
sitemap.graph()

sitemap.process()

t1 = time.time()
sitemap.publish()
t2 = time.time()
print("Publish  [%5.2f s]" % (round(t2-t1,2)))
print("Total    [%5.2f s]" % (round(t2-ts,2)))