Handle various other log formats in the pile

I hate myself now.
3 years ago · 85b72e8837
--- a/efnet-irclogger-convert.py
+++ b/efnet-irclogger-convert.py
@@ -9,48 +9,107 @@ date = filename.rsplit('/', 1)[-1][:10]

 # Read the log file line by line and write one "<timestamp> <TYPE> <text>" record per
 # recognised line to stdout; unrecognised lines are reported on stderr.
 # NOTE(review): this listing appears to interleave superseded and current versions of
 # several lines (e.g. two consecutive identical `if`/`elif` headers below), so it is not
 # runnable exactly as shown — confirm against the actual script before relying on it.
 with open(filename, 'rb') as fp:
 	for line in fp:
 		if line == b'\n':
 			continue
 		origLine = line
 		# Every accepted line must look like b'[HH:MM] ...\n': '[' at 0, ':' at 3, '] ' at 6-7, newline at the end.
 		if not (line[0:1] == b'[' and line[3:4] == b':' and line[6:8] == b'] ' and line[-1:] == b'\n'):
 			print(f'MALFORMED LINE: {line!r}', file = sys.stderr)
 			print(f'GROSSLY MALFORMED LINE: {line!r}', file = sys.stderr)
 			continue
 		# Split off the HH:MM prefix and the trailing newline; the rest is the message payload.
 		time = line[1:6].decode('ascii')
 		line = line[8:-1]
 		# Combine the date (taken from the filename) with HH:MM, seconds fixed at 0, interpreted as UTC.
 		ts = datetime.datetime(int(date[:4]), int(date[5:7]), int(date[8:]), int(time[:2]), int(time[3:]), 0).replace(tzinfo = datetime.timezone.utc).timestamp()
 		if line.startswith(b'<'): #PRIVMSG
 		if line.startswith(b'<'): # PRIVMSG
 			sys.stdout.buffer.write(f'{ts} PRIVMSG '.encode('ascii') + line + b'\n')
 		elif line.startswith(b'* '): #ACTION
 		elif line.startswith(b'* '): # ACTION
 			sys.stdout.buffer.write(f'{ts} ACTION '.encode('ascii') + line[2:] + b'\n')
 		elif line.startswith(b'*** '):
 			# words excludes the leading b'***' marker, so words[0] is the first token after it.
 			# NOTE(review): several guards below index words[n] directly (e.g. words[3], words[4], words[5])
 			# without a length check; a short line that matches the earlier part of a condition would raise
 			# IndexError — confirm such lines cannot occur in the input.
 			words = line.split(b' ')[1:]
 			if words[1:3] == [b'has', b'joined']: # JOIN
 			# JOIN
 			if words[1:3] == [b'has', b'joined']:
 				if b'!' in words[0]: # irssi format
 					words[0] = words[0].split(b'!', 1)[0]
 				sys.stdout.buffer.write(f'{ts} JOIN '.encode('ascii') + words[0] + b' joins' + b'\n')
 			elif words[2:4] == [b'has', b'joined'] and words[1][0:1] == b'(' and words[1][-1:] == b')':
 				sys.stdout.buffer.write(f'{ts} JOIN '.encode('ascii') + words[0] + b' joins' + b'\n')
 			elif words[0] == b'Joins:': # JOIN
 			elif words[2:] == [b'joined', b'the', b'channel']: # (newsgrabber 2015-12-21)
 				sys.stdout.buffer.write(f'{ts} JOIN '.encode('ascii') + words[0] + b' joins' + b'\n')
 			elif words[0] == b'Joins:':
 				sys.stdout.buffer.write(f'{ts} JOIN '.encode('ascii') + words[1] + b' joins' + b'\n')
 			elif words[1:3] == [b'has', b'left']: # PART
 			# PART
 			elif words[1:3] == [b'has', b'left'] and ((b'!' in words[0] and words[4][0:1] == b'[' and words[-1][-1:] == b']') or b'!' not in words[0]):
 				if b'!' in words[0]: # irssi format
 					# Reduce nick!user@host to the nick, strip the [ ] around the reason, and drop words[3].
 					words[0] = words[0].split(b'!', 1)[0]
 					words[4] = words[4][1:]
 					words[-1] = words[-1][:-1]
 					words = words[0:3] + words[4:]
 				# NOTE(review): words[3] is evaluated whenever len(words) <= 4, so a three-word line would raise IndexError — confirm.
 				reason = (b' [' + b' '.join(words[3:]) + b']') if len(words) > 4 or words[3] != b'' else b''
 				sys.stdout.buffer.write(f'{ts} PART '.encode('ascii') + words[0] + b' leaves' + reason + b'\n')
 			elif words[0] == b'Parts:': # PART
 			elif words[2:4] == [b'has', b'left'] and words[1][0:1] == b'(' and words[1][-1:] == b')' and len(words) >= 5:
 				reason = (b' [' + b' '.join(words[5:])[1:-1] + b']') if len(words) > 6 or (len(words) == 6 and words[5] != b'()') else b''
 				sys.stdout.buffer.write(f'{ts} PART '.encode('ascii') + words[0] + b' leaves' + reason + b'\n')
 			elif words[2:] == [b'left', b'the', b'channel']: # (projectnewsletter 2015-04-01)
 				sys.stdout.buffer.write(f'{ts} PART '.encode('ascii') + words[0] + b' leaves' + b'\n')
 			elif words[0] == b'Parts:' and words[3][0:1] == b'(' and words[-1][-1:] == b')':
 				reason = (b' [' + b' '.join(words[3:])[1:-1] + b']') if len(words) > 4 or words[3] != b'()' else b''
 				sys.stdout.buffer.write(f'{ts} PART '.encode('ascii') + words[1] + b' leaves' + reason + b'\n')
 			elif words[1:4] == [b'has', b'quit', b'IRC']: # QUIT
 				reason = (b' [' + b' '.join(words[4:])[1:-1] + b']') if len(words) > 4 or words[4] != b'()' else b''
 			# QUIT
 			elif words[1:4] == [b'has', b'quit', b'IRC'] and words[4][0:1] == b'(' and words[-1][-1:] == b')':
 				reason = (b' [' + b' '.join(words[4:])[1:-1] + b']') if len(words) > 5 or words[4] != b'()' else b''
 				sys.stdout.buffer.write(f'{ts} QUIT '.encode('ascii') + words[0] + b' quits' + reason + b'\n')
 			elif words[1:3] == [b'has', b'quit'] and b'!' in words[0] and words[3][0:1] == b'[' and words[-1][-1:] == b']':
 				words[0] = words[0].split(b'!', 1)[0]
 				reason = (b' [' + b' '.join(words[3:])[1:-1] + b']') if len(words) > 4 or words[3] != b'[]' else b''
 				sys.stdout.buffer.write(f'{ts} QUIT '.encode('ascii') + words[0] + b' quits' + reason + b'\n')
 			elif words[1:3] == [b'has', b'quit'] and words[3][0:1] == b'(' and words[-1][-1:] == b')': # (archivebot 2015-10 and 2016-02/03)
 				# NOTE(review): this branch requires words[3] to start with b'(' yet compares it to b'[]', which can
 				# never be equal; an empty b'()' reason would therefore be emitted as ' []' — presumably b'()' was intended, confirm.
 				reason = (b' [' + b' '.join(words[3:])[1:-1] + b']') if len(words) > 4 or words[3] != b'[]' else b''
 				sys.stdout.buffer.write(f'{ts} QUIT '.encode('ascii') + words[0] + b' quits' + reason + b'\n')
 			elif words[2:4] == [b'left', b'IRC'] and words[1][0:1] == b'(' and words[1][-1:] == b')' and (len(words) == 4 or (words[4][0:1] == b'(' and words[-1][-1:] == b')')): # (projectnewsletter 2015-03-27, 2015-03-23)
 				reason = (b' [' + b' '.join(words[4:])[1:-1] + b']') if len(words) > 5 or (len(words) == 5 and words[4] != b'()') else b''
 				sys.stdout.buffer.write(f'{ts} QUIT '.encode('ascii') + words[0] + b' quits' + reason + b'\n')
 			elif words[0] == b'Quits:': # QUIT
 			elif words[0] == b'Quits:' and words[3][0:1] == b'(' and words[-1][-1:] == b')':
 				reason = (b' [' + b' '.join(words[3:])[1:-1] + b']') if len(words) > 4 or words[3] != b'()' else b''
 				# NOTE(review): words[0] is b'Quits:' in this branch, so the record reads 'Quits: quits ...'; the
 				# 'Joins:' and 'Parts:' branches write words[1] instead — presumably words[1] was intended, confirm.
 				sys.stdout.buffer.write(f'{ts} QUIT '.encode('ascii') + words[0] + b' quits' + reason + b'\n')
 			elif words[1:4] == [b'was', b'kicked', b'by']: # KICK
 			# KICK
 			elif words[1:4] == [b'was', b'kicked', b'by'] and words[5][0:1] == b'(' and words[-1][-1:] == b')':
 				sys.stdout.buffer.write(f'{ts} KICK '.encode('ascii') + words[0] + b' is kicked by ' + words[4] + b' [' + b' '.join(words[5:])[1:-1] + b']' + b'\n')
 			elif words[1:3] == [b'sets', b'mode:']: # MODE
 				sys.stdout.buffer.write(f'{ts} MODE '.encode('ascii') + line[4:] + b'\n')
 			elif words[1:4] == [b'changes', b'topic', b'to:']: # TOPIC
 			elif words[1:5] == [b'has', b'been', b'kicked', b'by'] and b'!' in words[5] and words[6][0:1] == b'[' and words[-1][-1:] == b']':
 				sys.stdout.buffer.write(f'{ts} KICK '.encode('ascii') + words[0] + b' is kicked by ' + words[5].split(b'!', 1)[0] + b' [' + b' '.join(words[6:])[1:-1] + b']' + b'\n')
 			# MODE
 			elif words[1:3] == [b'sets', b'mode:'] or words[1:3] == [b'sets', b'mode']: # (newsgrabber 2015-12-20)
 				# Normalise the colon-less variant so both forms are written as 'sets mode:'.
 				words[2] = b'mode:'
 				sys.stdout.buffer.write(f'{ts} MODE '.encode('ascii') + b' '.join(words) + b'\n')
 			elif words[0].startswith(b'mode/') and words[1][0:1] == b'[' and words[-3][-1:] == b']':
 				if b'!' in words[-1]: # Not always the case since the mode might be set by an IRCd after a netsplit
 					words[-1] = words[-1].split(b'!', 1)[0]
 				# The last word is the setter; words[1:-2] is the bracketed mode string with its [ ] stripped.
 				sys.stdout.buffer.write(f'{ts} MODE '.encode('ascii') + words[-1] + b' sets mode: ' + b' '.join(words[1:-2])[1:-1] + b'\n')
 			elif words[1:6] == [b'gives', b'channel', b'operator', b'status', b'to'] and len(words) >= 7: # (archivebot 2016-02, internetarchive.bak 2015-03-02)
 				# One 'o' flag is emitted per word following 'to'.
 				sys.stdout.buffer.write(f'{ts} MODE '.encode('ascii') + words[0] + b' sets mode: +' + (b'o' * (len(words) - 6)) + b' ' + b' '.join(words[6:]) + b'\n')
 			elif words[1:4] == [b'gives', b'voice', b'to'] and len(words) == 5:
 				sys.stdout.buffer.write(f'{ts} MODE '.encode('ascii') + words[0] + b' sets mode: +v ' + words[-1] + b'\n')
 			elif words[1:4] == [b'sets', b'ban', b'on'] and len(words) == 5:
 				sys.stdout.buffer.write(f'{ts} MODE '.encode('ascii') + words[0] + b' sets mode: +b ' + words[-1] + b'\n')
 			# TOPIC
 			elif words[1:4] == [b'changes', b'topic', b'to:']:
 				sys.stdout.buffer.write(f'{ts} TOPIC '.encode('ascii') + words[0] + b' sets the topic to: ' + b' '.join(words[4:]) + b'\n')
 			elif words[1:4] == [b'changes', b'topic', b'to']: # TOPIC
 			elif words[1:4] == [b'changes', b'topic', b'to'] and words[4][0:1] == words[-1][-1:] == b"'":
 				sys.stdout.buffer.write(f'{ts} TOPIC '.encode('ascii') + words[0] + b' sets the topic to: ' + b' '.join(words[4:])[1:-1] + b'\n')
 			elif words[1:5] == [b'is', b'now', b'known', b'as']: # NICK
 				sys.stdout.buffer.write(f'{ts} NICK '.encode('ascii') + line[4:] + b'\n')
 			elif words[1:3] == [b'starts', b'logging']: # Silently ignore (there's already a JOIN)
 			elif words[1:4] == [b'changed', b'topic', b'of']:
 				# NOTE(review): the topic text is taken from words[6:], i.e. words[4] and words[5] are skipped
 				# unchecked — presumably the channel name and a 'to:' token; confirm against sample lines.
 				sys.stdout.buffer.write(f'{ts} TOPIC '.encode('ascii') + words[0].split(b'!', 1)[0] + b' sets the topic to: ' + b' '.join(words[6:]) + b'\n')
 			elif words[1:6] == [b'has', b'changed', b'the', b'topic', b'to:']:
 				sys.stdout.buffer.write(f'{ts} TOPIC '.encode('ascii') + words[0] + b' sets the topic to: ' + b' '.join(words[6:]) + b'\n')
 			elif words[1:5] == [b'changed', b'the', b'topic', b'to']:
 				sys.stdout.buffer.write(f'{ts} TOPIC '.encode('ascii') + words[0] + b' sets the topic to: ' + b' '.join(words[5:]) + b'\n')
 			# NICK
 			elif words[1:5] == [b'is', b'now', b'known', b'as']:
 				if b'!' in words[0]:
 					words[0] = words[0].split(b'!', 1)[0]
 				sys.stdout.buffer.write(f'{ts} NICK '.encode('ascii') + words[0] + b' is now known as ' + b' '.join(words[5:]) + b'\n')
 			# Others
 			elif words[1:3] == [b'starts', b'logging']: # Silently ignore, there's already a JOIN (irclogger 2020 etc.)
 				pass
 			else:
 				# Unrecognised '*** ' event: report it on stderr but still emit it, tagged OLDLOGS_UNCLASSIFIED.
 				print(f'MALFORMED LINE: {origLine!r}', file = sys.stderr)
 				sys.stdout.buffer.write(f'{ts} OLDLOGS_UNCLASSIFIED '.encode('ascii') + line + b'\n')
 		else:
 			# Payload is neither '<nick> ...', '* ...' nor '*** ...': same stderr report and OLDLOGS_UNCLASSIFIED record.
 			print(f'MALFORMED LINE: {origLine!r}', file = sys.stderr)
 			sys.stdout.buffer.write(f'{ts} OLDLOGS_UNCLASSIFIED '.encode('ascii') + line + b'\n')