97ac6a2a65f15fdb7bf00c3fa09ba7c12b3f7ff2
[treecutter.git] / src / tree-cutter.py
1 #!/usr/bin/python
2 import os
3 import fnmatch
4 import subprocess
5 import amara
6 import re
7 import tempfile
8 import errno
9 import time
10 import argparse
11 import shutil
12 import pygraphviz as pgv
13 from amara import bindery
14 from amara.xslt import transform
15 from Cheetah.Template import Template
16
# Command-line configuration: --style points at the XSLT/template style
# directory, --output at the publish target. Defaults assume the script
# runs from a working copy next to 'style/' and 'htdocs/' siblings.
parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

# Paths derived from the style directory: the docbook stylesheet and the
# Cheetah page template used by xsltConvert()/writeToTemplate() below.
style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

# Extensions of xincluded scripts that are executed to produce XML.
valid_scripts = ['.py','.pl']
# Sentinel meaning "no level limit" for genMenu().
MAXLEVEL = 10000
30
def mkdir_p(path):
    """Create *path* and any missing parents, like ``mkdir -p``.

    Succeeds silently when the directory already exists; any other
    OSError is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        # EEXIST alone is not enough: the existing entry could be a
        # regular file, which callers would then try to write into.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else: raise
def publish(src, target):
    """Mirror *src* into *target* with rsync, deleting stale files."""
    command = ["rsync", "-a", "--delete", src, target]
    status = subprocess.call(command)
    if status:
        print('Error: '+' '.join(command)+' Returncode ['+str(status)+']')
44
45
# XML namespace prefixes shared by every bindery.parse()/xml_select()
# call: docbook elements, XInclude, and XLink attributes.
PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}
49
class Directory():
    """Class containing the state of the directory with articles"""

    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
        # Walk the working tree and record the site link of every XML
        # article that carries both a title and a menu abbreviation.
        for dirpath, subdirs, files in os.walk(self._cwd):
            for name in files:
                if not fnmatch.fnmatch(name, '*.xml'):
                    continue
                path = os.path.join(dirpath, name)
                doc = bindery.parse(path, prefixes=PREFIXES)
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                if title and menu:
                    # './foo/index.xml' -> '/foo/index' -> link '/foo/'
                    stem = path.split('.')[1]
                    self._tree.append(stem.replace('index', ''))

    def set(self):
        """Return the scanned links as a set."""
        return set(self._tree)
71
class Page():
    """Class representing a webpage on the site"""

    def __init__(self, link):
        # Site-relative link plus bookkeeping for resources/scripts.
        self._link = link
        self._resources = []
        self._script = 0

    def link(self):
        """Return the site-relative link of this page."""
        return self._link
81
class Node():
    """A trie node: one path token, its payload, and child nodes."""

    def __init__(self, token, value):
        self._key = token
        self._payload = value
        self._kids = []

    def token(self):
        """The path token this node represents."""
        return self._key

    def value(self):
        """The payload stored at this node."""
        return self._payload

    def children(self):
        """Mutable list of child nodes."""
        return self._kids
96
class Trie():
    """Prefix tree of path tokens; leaves carry page payloads."""

    def __init__(self):
        self._root = []

    def _add(self, nodes, key, content):
        # Consume one token from the key; attach a leaf when it was the
        # last token, otherwise descend into the matching child (a key
        # whose prefix is absent from the trie is silently dropped).
        token = key.pop(0)
        if not key:
            nodes.append(Node(token, content))
            return
        for child in nodes:
            if child.token() == token:
                self._add(child.children(), key, content)

    def add(self, key, content):
        """Insert *content* at the path given by the token list *key*."""
        self._add(self._root, key, content)

    def _graph(self, nodes, G):
        # Emit a node per token and an edge per parent/child pair.
        for node in nodes:
            G.add_node(node.token())
            for child in node.children():
                G.add_edge(node.token(), child.token())
                self._graph(node.children(), G)

    def graph(self):
        """Print a graphviz representation of the trie to stdout."""
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for child in self._root:
            G.add_edge("sitemap", child.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
        print(G.string())
131
class Sitemap():
    """Class keeping the internal site structure"""

    def __init__(self):
        self._file = 'sitemap.txt'
        self._pages = []
        self._tree = Trie()

    def add_page(self, link):
        """Record *link* both in the flat page list and the trie."""
        page = Page(link)
        self._pages.append(page)
        # Split '/a/b/' into ['/a/', 'b/']-style tokens, dropping empties.
        tokens = [t for t in re.split(r'(^/\w*/|\w*/)', link) if t]
        self._tree.add(tokens, page)

    def read_map(self):
        """Load links from sitemap.txt, if it exists."""
        try:
            with open(self._file) as handle:
                entries = handle.read().split()
            for entry in entries:
                self.add_page(entry)
        except IOError:
            print('INFO: Could not read sitemap.txt - one will be created')

    def set(self):
        """Return the known links as a set."""
        return set(page.link() for page in self._pages)

    def pages(self):
        return self._pages

    def graph(self):
        """Print the trie as a graphviz graph."""
        self._tree.graph()
163
def generateSitemap():
    # Walk the article tree, merge what is found with an existing
    # sitemap.txt (preserving its ordering), rewrite sitemap.txt, and
    # return the sitemap: a list of dicts with keys
    # link/title/menu/output/exe/file/res/level.
    sitemap = []
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError, what_error:
        print 'Sitemap missing - generating one.'

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile,
                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
                                              u'xi': u'http://www.w3.org/2001/XInclude',
                                              u'xl': u'http://www.w3.org/1999/xlink'})
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                # Flag pages that xinclude an executable script; those are
                # expanded later by expandXincludeTxt().
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                # Only articles with both a title and a titleabbrev become
                # site pages.
                if title and menu:
                    found = 0
                    # './foo/index.xml'.split('.')[1] -> '/foo/index';
                    # dropping 'index' yields the site link '/foo/'.
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    # Collect linked resources and images that actually
                    # exist on disk, for later copying into the staging tree.
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    filename.replace('xml','html')),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    # Merge into an existing sitemap entry, or append a
                    # new one when the link was not listed before.
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print "adding "+link+" to sitemap"
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    # Persist the (possibly extended) link list for the next run.
    sfile = open('sitemap.txt','w')
    for l in sitemap:
        sfile.write(l['link']+'\n')
    sfile.close()
    return sitemap
230
def expandXincludeTxt(page):
    # Parse the article and, when the page was flagged executable,
    # replace every <xi:include parse='text'> that points at a script
    # with the XML that script prints on stdout.
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        code  = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                # Run the referenced script and parse its stdout as XML.
                exe = os.path.join(os.path.abspath(c.href))
                xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                xstr = bindery.parse(str(xml.stdout.read()))
                # Splice the generated children in at the include's own
                # position, then remove the include element itself.
                id = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(id,x)
                c.xml_parent.xml_remove(c)
    return doc
248
def xsltConvert(doc):
    """Convert *doc* to HTML with the external xsltproc tool.

    amara cannot handle the docbook stylesheets, so the document is
    serialised to a temp file and transformed from the article's own
    directory (so relative xincludes resolve); the HTML is returned
    as a string.

    NOTE(review): this reads the module-level ``page`` loop variable
    for the source directory rather than taking it as a parameter --
    kept as-is so the call sites do not change, but worth confirming.
    """
    cwd = os.getcwd()
    rundir = os.path.dirname(page['file'])
    os.chdir(rundir)
    try:
        # mkstemp() actually creates the files, avoiding the filename
        # race that makes tempfile.mktemp() deprecated.
        infd, infile = tempfile.mkstemp(dir='.')
        outfd, outfile = tempfile.mkstemp()
        os.close(outfd)
        tfi = os.fdopen(infd, 'w')
        tfi.write(doc.xml_encode())
        tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print('Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']')
        tfo = open(outfile,'r')
        result = tfo.read()
        tfo.close()
        os.remove(infile)
        os.remove(outfile)
    finally:
        # Always restore the working directory, even when xsltproc or
        # the temp-file handling fails.
        os.chdir(cwd)
    return result
272
def genMenu(page,sitemap,slevel,elevel):
    # Build a nested <ul> menu from *sitemap* (an ordered list of dicts
    # with 'link', 'menu' and 'level' keys) and return (html, title).
    #   page          : the page the menu belongs to, or None for the
    #                   full sitemap
    #   slevel/elevel : inclusive range of levels to render; passing
    #                   page's own level for both renders only its
    #                   siblings, elevel==MAXLEVEL renders everything.
    title = None
    sm = []
    if elevel == MAXLEVEL or elevel == 1 or page == None:
        # Whole-tree (or top-level) menu: consider every entry.
        html = '<ul>\n'
        sm = sitemap
    else:
        # Sibling menu: walk backwards to the first entry on a different
        # level (the parent) to get the heading, then collect the
        # contiguous run of entries sharing page's level.
        html = '<ul class="tree">\n'
        idx = sitemap.index(page)
        while (sitemap[idx]['level'] == page['level']):
            idx = idx-1
        title = sitemap[idx]['menu']
        idx = idx+1
        while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
            sm.append(sitemap[idx])
            idx = idx+1
    oldlevel = slevel

    for p in sm:
        # Skip entries outside the requested level window.
        if slevel > p['level'] or elevel < p['level']:
            continue
        if not title and p['link'] == '/':
            title = p['menu']

        # Open or close <ul> nesting according to the level change since
        # the previously rendered entry.
        if oldlevel < p['level']:
            html+='<ul>\n'
        elif oldlevel > p['level']:
            if p['link'][-1] == '/':
                html+='</li>\n'
            html+='</ul>\n</li>\n'
        if page != None and page == p:
            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
        else:
            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
        # Directory links ('.../') stay open so children can nest inside;
        # leaf links and the root itself are closed immediately.
        if p['link'][-1] != '/' or p['link'] == '/':
            html+='</li>\n'
        oldlevel = p['level']
    html+='</ul>\n'
    return (html,title)
312
def writeToTemplate(page,doc,sitemap):
    # Render one page through the Cheetah template into the temporary
    # staging tree (module-level ``tmptarget``), then copy the page's
    # referenced resources alongside it.
    (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
    template = Template(file=style_tmpl,
                        searchList=[{'title':page['title']},
                                    {'menu':menu},
                                    {'article':doc},
                                    {'levelmenu':levelmenu},
                                    {'levelname':levelname}])
    outfile = tmptarget+page['output']
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
    # Resources (linked files, images) are mirrored into the staging
    # tree next to the generated HTML.
    for r in page['res']:
        mkdir_p(os.path.dirname(tmptarget+r))
        shutil.copyfile(r, tmptarget+r)
330
def createSitemap(sitemap):
    # Render the full sitemap as its own page (sitemap.en.html) in the
    # staging tree; the menu HTML doubles as the article body.
    (menu,menuname) = genMenu(None,sitemap,1,MAXLEVEL)
    template = Template(file=style_tmpl,
                        searchList=[
            {'title':'Sitemap'},
            {'menu':menu},
            {'article':menu},
            {'levelmenu':''},
            {'levelname':''}])
    outfile = tmptarget+'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
345
346 dir_ = Directory()
347 sitemap = Sitemap()
348
349 dir_.scan()
350 sitemap.read_map()
351
352 missing = dir_.set() - sitemap.set()
353 removed = sitemap.set() - dir_.set()
354 for page in removed:
355     print removed+' pages missing!!'
356
357 for page in missing:
358     print 'adding missing page '+page
359     sitemap.add_page(page)
360
361 sitemap.graph()
362
363
364 sitemap = generateSitemap()
365 tmptarget = tempfile.mkdtemp()+'/'
366 for page in sitemap:
367     t1 = time.time()
368     print "Page : %-30s %30s" % (page['link'],
369                         time.ctime(os.stat(page['file']).st_mtime)),
370     doc = expandXincludeTxt(page)
371     pubdoc = xsltConvert(doc)
372     writeToTemplate(page,pubdoc,sitemap)
373     t2 = time.time()
374     print "[%5.2f s]" % (round(t2-t1,2))
375
376 createSitemap(sitemap)
377 publish(tmptarget, args.output)
378 publish(args.style+"css", args.output)
379 publish(args.style+"images",args.output)