scripts/import_sf.py

1 """ Import tracker data from Sourceforge.NET
3 This script needs four steps to work:
5 1. Export the project XML data using the admin web interface at sf.net
6 2. Run the file fetching (these are not included in the XML):
8     import_sf.py files <path to XML> <path to files dir>
10    this will place all the downloaded files in the files dir by file id.
11 3. Convert the sf.net XML to Roundup "export" format:
13     import_sf.py import <tracker home> <path to XML> <path to files dir>
15    this will generate a directory "/tmp/imported" which contains the
16    data to be imported into a Roundup tracker.
17 4. Import the data:
19     roundup-admin -i <tracker home> import /tmp/imported
21 And you're done!
22 """

import sys, os, csv, time, urllib2, httplib, mimetypes, urlparse

# Python 2.3 ... 2.6 compatibility:
from roundup.anypy.sets_ import set

try:
    import cElementTree as ElementTree
except ImportError:
    from elementtree import ElementTree

from roundup import instance, hyperdb, date, support, password

today = date.Date('.')

DL_URL = 'http://sourceforge.net/tracker/download.php?group_id=%(group_id)s&atid=%(atid)s&aid=%(aid)s'

def get_url(aid):
    """ So basically we have to jump through hoops, given an artifact id,
    to figure out what the URL should be to access that artifact, and
    hence any attached files."""
    # first we hit this URL...
    conn = httplib.HTTPConnection("sourceforge.net")
    conn.request("GET", "/support/tracker.php?aid=%s"%aid)
    response = conn.getresponse()
    # ... which should respond with a redirect to the correct URL, which
    # has the magic "group_id" and "atid" values in it that we need
    assert response.status == 302, 'response code was %s'%response.status
    location = response.getheader('location')
    query = urlparse.urlparse(location)[-2]
    info = dict([param.split('=') for param in query.split('&')])
    return DL_URL%info

def fetch_files(xml_file, file_dir):
    """ Fetch files referenced in the xml_file into the dir file_dir. """
    root = ElementTree.parse(xml_file).getroot()
    to_fetch = set()
    deleted = set()
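    # walk each artifact's history: "File Added" / "File Deleted" events
    # carry the file id as the prefix of old_value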
    for artifact in root.find('artifacts'):
        for field in artifact.findall('field'):
            if field.get('name') == 'artifact_id':
                aid = field.text
        for field in artifact.findall('field'):
            if field.get('name') != 'artifact_history': continue
            for event in field.findall('history'):
                d = {}
                for hfield in event.findall('field'):
                    d[hfield.get('name')] = hfield.text
                if d['field_name'] == 'File Added':
                    fid = d['old_value'].split(':')[0]
                    to_fetch.add((aid, fid))
                if d['field_name'] == 'File Deleted':
                    fid = d['old_value'].split(':')[0]
                    deleted.add((aid, fid))
    to_fetch = to_fetch - deleted

    # don't re-fetch files we already have on disk (the dir is keyed
    # by file id)
    got = set(os.listdir(file_dir))
    to_fetch = set([(aid, fid) for (aid, fid) in to_fetch
        if fid not in got])

    # load cached urls (sigh)
    urls = {}
    if os.path.exists(os.path.join(file_dir, 'urls.txt')):
        for line in open(os.path.join(file_dir, 'urls.txt')):
            aid, url = line.strip().split()
            urls[aid] = url
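
    # fetch whatever remains; each download URL needs the artifact's
    # group_id / atid, so resolve (and cache) those per artifact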
    for aid, fid in support.Progress('Fetching files', list(to_fetch)):
        if aid not in urls:
            urls[aid] = get_url(aid)
            f = open(os.path.join(file_dir, 'urls.txt'), 'a')
            f.write('%s %s\n'%(aid, urls[aid]))
            f.close()
        url = urls[aid] + '&file_id=' + fid
        f = urllib2.urlopen(url)
        data = f.read()
        f.close()
        n = open(os.path.join(file_dir, fid), 'wb')
        n.write(data)
        n.close()

def import_xml(tracker_home, xml_file, file_dir):
    """ Generate Roundup tracker import files based on the tracker schema,
    the sf.net XML export and the files downloaded from sf.net. """
    tracker = instance.open(tracker_home)
    db = tracker.open('admin')

    resolved = db.status.lookup('resolved')
    unread = db.status.lookup('unread')
    chatting = db.status.lookup('chatting')
    critical = db.priority.lookup('critical')
    urgent = db.priority.lookup('urgent')
    bug = db.priority.lookup('bug')
    feature = db.priority.lookup('feature')
    wish = db.priority.lookup('wish')
    adminuid = db.user.lookup('admin')
    anonuid = db.user.lookup('anonymous')

    root = ElementTree.parse(xml_file).getroot()
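
    # sf.net exports dates as epoch-second timestamps; convert them to
    # Roundup Date objects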
    def to_date(ts):
        return date.Date(time.gmtime(float(ts)))

    # parse out the XML
    artifacts = []
    categories = set()
    users = set()
    add_files = set()
    remove_files = set()
    for artifact in root.find('artifacts'):
        d = {}
        op = {}
        artifacts.append(d)
        for field in artifact.findall('field'):
            name = field.get('name')
            if name == 'artifact_messages':
                for message in field.findall('message'):
                    l = d.setdefault('messages', [])
                    m = {}
                    l.append(m)
                    for mfield in message.findall('field'):
                        mname = mfield.get('name')
                        if mname == 'adddate':
                            m[mname] = to_date(mfield.text)
                        else:
                            m[mname] = mfield.text
                        if mname == 'user_name': users.add(mfield.text)
            elif name == 'artifact_history':
                for event in field.findall('history'):
                    l = d.setdefault('history', [])
                    e = {}
                    l.append(e)
                    for hfield in event.findall('field'):
                        hname = hfield.get('name')
                        if hname == 'entrydate':
                            e[hname] = to_date(hfield.text)
                        else:
                            e[hname] = hfield.text
                        if hname == 'mod_by': users.add(hfield.text)
                    if e['field_name'] == 'File Added':
                        add_files.add(e['old_value'].split(':')[0])
                    elif e['field_name'] == 'File Deleted':
                        remove_files.add(e['old_value'].split(':')[0])
            elif name == 'details':
                op['body'] = field.text
            elif name == 'submitted_by':
                op['user_name'] = field.text
                d[name] = field.text
                users.add(field.text)
            elif name == 'open_date':
                thedate = to_date(field.text)
                op['adddate'] = thedate
                d[name] = thedate
            else:
                d[name] = field.text

        categories.add(d['category'])

        if 'body' in op:
            l = d.setdefault('messages', [])
            l.insert(0, op)

    add_files -= remove_files

    # create users
    userd = {'nobody': '2'}
    users.remove('nobody')
    data = [
        {'id': '1', 'username': 'admin', 'password': password.Password('admin'),
            'roles': 'Admin', 'address': 'richard@python.org'},
        {'id': '2', 'username': 'anonymous', 'roles': 'Anonymous'},
    ]
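    # everyone else becomes a regular User with a generated
    # users.sourceforge.net address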
    for n, user in enumerate(list(users)):
        userd[user] = str(n+3)
        data.append({'id': str(n+3), 'username': user, 'roles': 'User',
            'address': '%s@users.sourceforge.net'%user})
    write_csv(db.user, data)
    users = userd

    # create categories as keywords (Roundup ids start at 1)
    categoryd = {'None': None}
    categories.remove('None')
    data = []
    for n, category in enumerate(list(categories)):
        categoryd[category] = str(n+1)
        data.append({'id': str(n+1), 'name': category})
    write_csv(db.keyword, data)
    categories = categoryd

    # create issues
    issue_data = []
    file_data = []
    message_data = []
    issue_journal = []
    message_id = 0
    for artifact in artifacts:
        d = {}
        d['id'] = artifact['artifact_id']
        d['title'] = artifact['summary']
        d['assignedto'] = users[artifact['assigned_to']]
        if d['assignedto'] == '2':
            d['assignedto'] = None
        d['creation'] = artifact['open_date']
        activity = artifact['open_date']
        d['creator'] = users[artifact['submitted_by']]
        actor = d['creator']
        if categories[artifact['category']]:
            d['keyword'] = [categories[artifact['category']]]
        issue_journal.append((
            d['id'], d['creation'].get_tuple(), d['creator'], "'create'", {}
        ))
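
        # map sf.net's numeric 1..9 priority onto the classic schema's
        # named priorities; feature requests only rate feature or wish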
        p = int(artifact['priority'])
        if artifact['artifact_type'] == 'Feature Requests':
            if p > 3:
                d['priority'] = feature
            else:
                d['priority'] = wish
        else:
            if p > 7:
                d['priority'] = critical
            elif p > 5:
                d['priority'] = urgent
            elif p > 3:
                d['priority'] = bug
            else:
                d['priority'] = feature
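
        # map sf.net status names; "Deleted" artifacts are imported as
        # resolved and retired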
        s = artifact['status']
        if s == 'Closed':
            d['status'] = resolved
        elif s == 'Deleted':
            d['status'] = resolved
            d['is retired'] = True
        else:
            d['status'] = unread
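
        # convert messages: commenters (other than anonymous) go on the
        # nosy list, and any comment bumps the issue from unread to
        # chatting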
        nosy = set()
        msgs = []
        for message in artifact.get('messages', []):
            authid = users[message['user_name']]
            if not message['body']: continue
            body = convert_message(message['body'], message_id)
            if not body: continue
            m = {'content': body, 'author': authid,
                'date': message['adddate'],
                'creation': message['adddate'], }
            msgs.append(m)
            if authid not in (None, '2'):
                nosy.add(authid)
            activity = message['adddate']
            actor = authid
            if d['status'] == unread:
                d['status'] = chatting

        # add import message
        m = {'content': 'IMPORT FROM SOURCEFORGE', 'author': '1',
            'date': today, 'creation': today}
        msgs.append(m)

        # sort this artifact's messages and assign ids; sorting the
        # global message_data list here would re-assign ids to earlier
        # artifacts' messages and break their links
        d['messages'] = []
        msgs.sort(lambda a, b: cmp(a['date'], b['date']))
        for message in msgs:
            message_id += 1
            message['id'] = str(message_id)
            d['messages'].append(message['id'])
        message_data.extend(msgs)

        d['nosy'] = list(nosy)

        files = []
        for event in artifact.get('history', []):
            if event['field_name'] == 'File Added':
                fid, name = event['old_value'].split(':', 1)
                if fid in add_files:
                    files.append(fid)
                    name = name.strip()
                    try:
                        f = open(os.path.join(file_dir, fid), 'rb')
                        content = f.read()
                        f.close()
                    except IOError:
                        content = 'content missing'
                    file_data.append({
                        'id': fid,
                        'creation': event['entrydate'],
                        'creator': users[event['mod_by']],
                        'name': name,
                        'type': mimetypes.guess_type(name)[0],
                        'content': content,
                    })
                continue
            elif event['field_name'] == 'close_date':
                action = "'set'"
                info = { 'status': unread }
            elif event['field_name'] == 'summary':
                action = "'set'"
                info = { 'title': event['old_value'] }
            else:
                # not an interesting / translatable event
                continue
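            # journal rows are (nodeid, date tuple, userid, action,
            # params), matching the layout roundup-admin imports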
            row = [ d['id'], event['entrydate'].get_tuple(),
                users[event['mod_by']], action, info ]
            if event['entrydate'] > activity:
                activity = event['entrydate']
            issue_journal.append(row)
        d['files'] = files

        d['activity'] = activity
        d['actor'] = actor
        issue_data.append(d)
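
    # emit the import files; write_csv mirrors Roundup's export layout
    # (colon-delimited CSV of repr()'d property values)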
    write_csv(db.issue, issue_data)
    write_csv(db.msg, message_data)
    write_csv(db.file, file_data)

    f = open('/tmp/imported/issue-journals.csv', 'w')
    writer = csv.writer(f, colon_separated)
    writer.writerows(issue_journal)
    f.close()

def convert_message(content, msgid):
    """ Strip off the useless sf message header crap. """
    if content[:14] == 'Logged In: YES':
        return '\n'.join(content.splitlines()[3:]).strip()
    return content

class colon_separated(csv.excel):
    delimiter = ':'

def write_csv(klass, data):
    props = klass.getprops()
    if not os.path.exists('/tmp/imported'):
        os.mkdir('/tmp/imported')
    f = open('/tmp/imported/%s.csv'%klass.classname, 'w')
    writer = csv.writer(f, colon_separated)
    propnames = klass.export_propnames()
    propnames.append('is retired')
    writer.writerow(propnames)
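    # serialize each property the way Roundup's exporter does:
    # dates/intervals as time tuples, passwords as strings, missing
    # multilinks as [], everything repr()'d; "is retired" goes last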
    for entry in data:
        row = []
        for name in propnames:
            if name == 'is retired':
                continue
            prop = props[name]
            if name in entry:
                if isinstance(prop, (hyperdb.Date, hyperdb.Interval)):
                    row.append(repr(entry[name].get_tuple()))
                elif isinstance(prop, hyperdb.Password):
                    row.append(repr(str(entry[name])))
                else:
                    row.append(repr(entry[name]))
            elif isinstance(prop, hyperdb.Multilink):
                row.append('[]')
            elif name in ('creator', 'actor'):
                row.append("'1'")
            elif name in ('created', 'activity'):
                row.append(repr(today.get_tuple()))
            else:
                row.append('None')
        row.append(entry.get('is retired', False))
        writer.writerow(row)

        if isinstance(klass, hyperdb.FileClass) and entry.get('content'):
            fname = klass.exportFilename('/tmp/imported/', entry['id'])
            support.ensureParentsExist(fname)
            c = open(fname, 'wb')
            if isinstance(entry['content'], unicode):
                c.write(entry['content'].encode('utf8'))
            else:
                c.write(entry['content'])
            c.close()

    f.close()

    # start with an empty journal file for the class; import_xml writes
    # the real issue journal afterwards
    f = open('/tmp/imported/%s-journals.csv'%klass.classname, 'w')
    f.close()

if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] == 'import':
        import_xml(*sys.argv[2:])
    elif len(sys.argv) > 1 and sys.argv[1] == 'files':
        fetch_files(*sys.argv[2:])
    else:
        print __doc__
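
# A typical session (paths here are examples only):
#   python import_sf.py files project.xml ./sf-files
#   python import_sf.py import /path/to/tracker project.xml ./sf-files
#   roundup-admin -i /path/to/tracker import /tmp/imported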