Return to Snippet

Revision: 7918
at August 20, 2008 22:36 by denilw


Initial Code
#!/usr/bin/env python

"""IMAP Incremental Backup Script"""
__version__ = "1.2e" # Not likely to change soon
__author__ = "Rui Carmo (http://the.taoofmac.com)"
__copyright__ = "(C) 2006 Rui Carmo. Code under BSD License."
__contributors__ = "Bob Ippolito (fix for http://python.org/sf/1092502)"

# THIS IS BETA SOFTWARE - USE AT YOUR OWN RISK, I TAKE NO RESPONSIBILITY FOR ANY DATA LOSS
# See http://the.taoofmac.com/space/Projects/imapbackup.py for more information.

import getpass, os, gc, sys, time, platform, getopt
import mailbox, rfc822, imaplib, socket, email
import StringIO, re, csv, sha, gzip, bz2

# Progress spinner
# spinner_pos is the index into `spinner` of the frame spin() draws next.
spinner_pos = 0
# The four animation frames, each converted to a UTF-8 encoded string
# (uses the Python 2 `unicode` builtin).
spinner=[c.encode("utf-8") for c in unicode("|/-\\","utf-8")]

def spin(i):
  """Redraw the one-character progress spinner and advance it by one frame.

  The argument is accepted but not used; the current frame is tracked in the
  module-level spinner_pos.  Nothing is drawn unless stdin is a tty.
  """
  global spinner_pos
  if not sys.stdin.isatty():
    return
  frame = spinner[spinner_pos]
  # The carriage return moves back to column 0 so the frame overwrites itself
  sys.stdout.write("\r" + frame)
  sys.stdout.flush()
  spinner_pos = (spinner_pos + 1) % len(spinner)

def clean_exit():
  """Emit a newline on stdout so the shell prompt starts on a fresh line."""
  out = sys.stdout
  out.write("\n")

def cli_exception(type, value, tb):
  """Exception hook that turns Ctrl-C into a quiet exit.

  KeyboardInterrupt (and subclasses) only trigger clean_exit(); every other
  exception is passed on to the interpreter's default hook, which prints
  the usual traceback.
  """
  if issubclass(type, KeyboardInterrupt):
    clean_exit()
    return
  sys.__excepthook__(type, value, tb)

# Make sure we get a chance to clean up the display on a tty:
# when stdin is a terminal, route uncaught exceptions through cli_exception
# so that a KeyboardInterrupt ends with a newline instead of a traceback.
if sys.stdin.isatty():
  sys.excepthook=cli_exception

# Helper class for IMAP folder list parsing
class mailboxlist(csv.excel):
  """CSV dialect used to split IMAP LIST responses into fields.

  Identical to csv.excel except that fields are separated by a single
  space; quoting rules are inherited unchanged.
  """
  delimiter = " "

# Hideous fix to counteract http://python.org/sf/1092502
# (which should have been fixed ages ago.)
def _fixed_socket_read(self, size=-1):
  """Replacement for socket._fileobject.read (see python.org/sf/1092502).

  self -- the file-like socket wrapper; this function uses its _rbuf,
          _rbufsize, default_bufsize and _sock attributes
  size -- number of bytes wanted; a negative value means "read until EOF"

  Returns the data read as one string.  Anything received beyond `size`
  is stored back in self._rbuf for the next call.
  """
  pending = self._rbuf
  # Fast path: the request can be served entirely from the buffer
  if size >= 0 and len(pending) >= size:
    self._rbuf = pending[size:]
    return pending[:size]

  chunks = []
  if pending:
    chunks.append(pending)
  self._rbuf = ""

  if size < 0:
    # Read until EOF, using the default size when the buffer size is <= 1
    if self._rbufsize <= 1:
      chunk_size = self.default_bufsize
    else:
      chunk_size = self._rbufsize
    chunk = self._sock.recv(chunk_size)
    while chunk:
      chunks.append(chunk)
      chunk = self._sock.recv(chunk_size)
    return "".join(chunks)

  # Read until `size` bytes have been gathered or EOF is seen
  have = len(pending)
  while True:
    missing = size - have
    # the actual fix: never ask recv() for more than the buffer size
    chunk = self._sock.recv(min(self._rbufsize, missing))
    if not chunk:
      break
    got = len(chunk)
    if got >= missing:
      # Keep any surplus for the next read
      self._rbuf = chunk[missing:]
      chunks.append(chunk[:missing])
      break
    chunks.append(chunk)
    have += got
  return "".join(chunks)

# Platform detection to enable the socket patch.
# (Other Python builds may be affected as well, but this combination is the
# one known for certain to have the problem.)
if platform.python_version() == '2.3.5' and 'Darwin' in platform.platform():
  socket._fileobject.read = _fixed_socket_read

# Regular expressions for parsing.
# Raw string literals keep the backslashes intact without relying on Python
# passing unknown escapes such as "\d" or "\(" through unchanged.
# Captures the value of a Message-Id header line (case-insensitive)
msgmatch = re.compile(r"^Message\-Id\: (.+)", re.IGNORECASE | re.MULTILINE)
# Captures one non-empty line
filematch = re.compile(r"(.+)", re.MULTILINE)
# Any run of whitespace, including embedded line breaks
blanks = re.compile(r'\s+', re.MULTILINE)
# Captures the byte count from a "<n> (RFC822.SIZE <bytes>)" fetch response
msgsize = re.compile(r"\d+ \(RFC822.SIZE (\d+).*\)")

# Constants
# Separator used to split IMAP folder names when building mbox file names
IMAP_PATH_SEPARATOR='/' # May be different, depending on IMAP server
# Fixed marker embedded in the Message-Ids this script synthesises, so that
# they can be recognised again when the message is written to disk
UUID = '19AF1258-1AAF-44EF-9D9A-731079D6FAD7' # Used to generate Message-Ids

def collectFromIMAP(server, imap_folder):
  """Collect the Message-Ids found in a given IMAP folder.

  server      -- imaplib connection, already logged in
  imap_folder -- name of the folder to select and scan

  Returns a dictionary mapping each Message-Id to the message number the
  server reported for it.  When several messages share a Message-Id only
  the first one is kept.  A message without a parseable Message-Id header
  gets a synthetic id built from UUID and a SHA-1 digest of its
  From/To/Cc/Date/Subject headers.
  """
  server.select(imap_folder)
  sys.stdout.write("  IMAP: Scanning %s" % imap_folder)
  # List all messages
  typ, data = server.search(None, 'ALL')
  messages = {}
  i = 0
  for num in data[0].split():
    # Retrieve each individual Message-Id
    typ, data = server.fetch(num, '(BODY[HEADER.FIELDS (MESSAGE-ID)])')
    header = data[0][1].strip()
    # remove newlines inside Message-Id (a dumb Exchange trait)
    header = blanks.sub(' ', header)
    found = msgmatch.match(header)
    if found is not None:
      msg_id = found.group(1)
      if msg_id not in messages:
        # avoid adding dupes
        messages[msg_id] = num
    else:
      # Some messages may have no Message-Id, so we'll synthesise one
      # (this usually happens with Sent, Drafts and .Mac news)
      typ, data = server.fetch(num, '(BODY[HEADER.FIELDS (FROM TO CC DATE SUBJECT)])')
      header = data[0][1].strip()
      # Flatten the header block onto one line before hashing it
      # (IMAP header lines are terminated by CRLF)
      header = header.replace('\r\n','\t')
      messages['<' + UUID + '.' + sha.sha(header).hexdigest() + '>'] = num
    i = i + 1
    spin(i)
  sys.stdout.write("\n  IMAP: Found %d unique messages in %s.\n" % (len(messages),imap_folder))
  return messages

def collectFromFile(filename, compress):
  """Collect the Message-Ids stored in a given mbox file.

  filename -- path of the mbox file; a missing file yields an empty result
  compress -- 'gzip' or 'bzip2' to read through the matching decompressor,
              anything else to read the file as-is

  Returns a dictionary whose keys (and values) are the Message-Ids found.
  """
  # Most of this code is deprecated in Python > 2.3, since PortableUnixMailbox is no more
  messages = {}
  i = 0
  if os.path.exists(filename):
    sys.stdout.write("  FILE: Scanning %s" % filename)
    if compress == 'gzip':
      handle = gzip.GzipFile(filename,'rb')
    elif compress == 'bzip2':
      handle = bz2.BZ2File(filename,'rb')
    else:
      handle = open(filename,'rb')
    # Close the file even if parsing a message blows up
    try:
      for message in mailbox.PortableUnixMailbox(handle):
        header = ''
        # We assume all messages on disk have message-ids
        # NOTE(review): getfirstmatchingheader appears to return an empty
        # result rather than raise KeyError when the header is missing, in
        # which case the regexp warning further down is the one that fires
        # -- confirm against the rfc822 module.
        try:
          header =  ''.join(message.getfirstmatchingheader('message-id'))
        except KeyError:
          # No message ID was found. Warn the user and move on
          sys.stdout.write("\n  WARNING: Message #%d on %s does not have Message-Id header: %s." % (i, filename, str(message.getfirstmatchingheader('message-id'))))
        header = blanks.sub(' ', header.strip())
        try:
          msg_id = msgmatch.match(header).group(1)
          # Dictionary membership test; avoids building a key list per message
          if msg_id not in messages:
            # avoid adding dupes
            messages[msg_id] = msg_id
        except AttributeError:
          # Message-Id was found but could somehow not be parsed by regexp (highly bloody unlikely)
          sys.stdout.write("\n  WARNING: Mailbox file seems not to have been generated by this program.")
          sys.stdout.write("\n           Message-Id scanning turned up '%s'" % header)
        i = i + 1
        spin(i)
    finally:
      handle.close()
  sys.stdout.write("\n  FILE: Found %d unique messages in %s.\n" % (len(messages),filename))
  return messages

def updateMailbox(server, imap_folder, mailbox, messages, existing, compress, clobber):
  """Append messages from an IMAP folder to an mbox file on disk.

  server      -- imaplib connection, already logged in
  imap_folder -- folder to select on the server
  mailbox     -- path of the mbox file to write to
  messages    -- dictionary of Message-Id -> message number on the server
  existing    -- dictionary keyed by the Message-Ids already in the file;
                 those messages are skipped
  compress    -- 'gzip' appends to a gzip file, 'bzip2' rewrites a bzip2
                 file from scratch, anything else appends to a plain file
  clobber     -- only selects the wording of the progress message here
  """
  server.select(imap_folder)
  # Check if server supports PEEK
  # (bit redundant to do it every time, I know...)
  # imaplib raises IMAP4.error when the server answers BAD, so treat that
  # the same as any other non-OK reply and fall back to a plain fetch.
  try:
    response = server.fetch("1:1", "(RFC822.PEEK)")
    peek_supported = response[0] == "OK"
  except imaplib.IMAP4.error:
    peek_supported = False
  if peek_supported:
    fetch_command = "RFC822.PEEK"
  else:
    fetch_command = "RFC822"
  i = 0
  maxlength = total = 0
  if clobber == True:
    sys.stdout.write('  COPY: Copying from %s to %s' % (imap_folder, mailbox))
  else:
    sys.stdout.write('  APPEND: Appending from %s to %s' % (imap_folder, mailbox))
  # Open disk file
  if compress == 'gzip':
    mbx = gzip.GzipFile(mailbox,'ab',9)
  elif compress == 'bzip2':
    mbx = bz2.BZ2File(mailbox,'wb',512*1024,9)
  else:
    mbx = open(mailbox,'ab')
  # Make sure the file is flushed and closed even if a fetch fails half-way
  try:
    for msg_id in messages:
      # If IMAP message is not in mbox file
      if msg_id not in existing:
        # Get raw message size
        typ, data = server.fetch(messages[msg_id], '(RFC822.SIZE)')
        length = int(msgsize.match(data[0]).group(1))
        maxlength = max(length, maxlength)
        total = total + length
        # This "From" and the terminating newline below delimit messages in mbox files.
        # The date uses the asctime layout (abbreviated month name, %b), which
        # is what strict mbox readers expect on the separator line.
        separator = "From nobody %s\n" % time.strftime('%a %b %d %H:%M:%S %Y')
        # If this is one of our synthesised Message-Ids, insert it before the other headers
        if UUID in msg_id:
          separator = separator + "Message-Id: %s\n" % msg_id
        mbx.write(separator)
        typ, data = server.fetch(messages[msg_id], fetch_command)
        # Store the message with bare LF line endings
        mbx.write(data[0][1].strip().replace('\r',''))
        # Drop the (possibly large) message body before fetching the next one
        del data
        gc.collect()
        mbx.write('\n\n')
        i = i + 1
        spin(i)
  finally:
    mbx.close()
  if i == 0:
    sys.stdout.write('\n  INFO: No new messages.\n')
  else:
    sys.stdout.write('\n  SUMMARY: Appended %d messages to %s\n  (%d bytes, of which the largest message was %d bytes)\n' % (i, mailbox, total, maxlength))

def scanTree(server, compress, clobber):
  """Parse the server's folder listing and back up every folder in it.

  server   -- imaplib connection, already logged in
  compress -- 'gzip', 'bzip2' or anything else for uncompressed files;
              selects the file name suffix and is passed on to the helpers
  clobber  -- when true, an existing mailbox file is removed and rebuilt
              instead of being scanned and appended to

  Mailbox files are created relative to the current working directory.
  """
  # Obtain folder listing
  typ, data = server.list(pattern='*')
  # An empty listing can come back as [None]; drop such entries before joining
  listing = [entry for entry in data if entry is not None]
  # Parse folder listing as a CSV dialect (automatically removes quotes)
  reader = csv.reader(StringIO.StringIO('\n'.join(listing)),dialect='mailboxlist')
  # Iterate over each folder
  for row in reader:
    # Third field of a listing line is the folder name
    imap_folder = row[2]
    # generate a file name for the mailbox
    # (we assume that folders can contain messages, so we store messages in a '.mbox' file
    # named after the IMAP mailbox, with the path separator replaced by dots)
    filename = '.'.join(imap_folder.split(IMAP_PATH_SEPARATOR)) + '.mbox'
    if compress == 'gzip':
      filename = filename + '.gz'
    elif compress == 'bzip2':
      filename = filename + '.bz2'

    existing = {}
    # Collect Message-Ids from each folder
    messages = collectFromIMAP(server, imap_folder)
    if os.path.exists(filename):
      if clobber == True:
        os.remove(filename)
      elif compress != 'bzip2':
        # Collect pre-existing Message-Ids from disk file
        existing = collectFromFile(filename, compress)
    # now copy messages across
    updateMailbox(server, imap_folder, filename, messages, existing, compress, clobber)

def main():
  """Main entry point"""
  try:
    opts, args = getopt.getopt(sys.argv[1:], "z:s:u:p:y", ["compress=","server=", "username=","password=","yes-i-want-to-clobber-files"])
  except getopt.GetoptError:
    print "Usage: imapbackup [OPTIONS]"
    print "-y --yes-i-want-to-clobber-files does not try to append, or warn about bzip2 clobbering"
    print "-z (gzip|bzip2) --compress=(gzip|bzip2) create/append to compressed files (EXPERIMENTAL)"
    print "   WARNING: bzip2 does not allow for appending, existing files will be clobbered."
    print "-s HOSTNAME --server=HOSTNAME    connect to HOSTNAME"
    print "-u USERNAME --username=USERNAME  with USERNAME"
    print "-p PASSWORD --password=PASSWORD  with PASSWORD (you will be prompted for one if missing)"
    print "\nMailbox files will be created IN THE CURRENT WORKING DIRECTORY"
    sys.exit(2)
  username = password = server = None
  clobber = False
  compress = 'plain'
  for option, value in opts:
    if option in ("-y", "--yes-i-want-to-clobber-files"):
      print "WARNING: All existing mailbox files will be overwritten!"
      clobber = True
    if option in ("-z", "--compress"):
      if value in ('gzip','bzip2'):
        compress = value
      else:
        print "ERROR: Invalid compression type specified."
        sys.exit(2)
    if option in ("-s", "--server"):
      server = value
    if option in ("-u", "--username"):
      username = value
    if option in ("-p", "--password"):
      password = value
  if compress == 'bzip2' and clobber == False:
    print "ERROR: bzip2 compression does not allow for appending."
    print"        Please specify -y with it if you want to remove existing archives."
    sys.exit(2)
  elif compress == 'gzip' and clobber == False:
    print "WARNING: Appending will work, but .mbox.gz scanning is VERY slow."
    print "         You may want to consider using uncompressed files and"
    print "         running gzip -9 *.mbox after the backup run."
  if(server is None):
    print "ERROR: No server specified."
    sys.exit(2)
  if(username is None):
    print "ERROR: No username specified."
    sys.exit(2)
  if(password is None):
    password = getpass.getpass()
  server = imaplib.IMAP4(server)
  server.login(username, password)
  scanTree(server, compress, clobber)
  server.logout()

# Script entry point (skipped when the module is imported)
if __name__ == '__main__':
  # Register the dialect under the name that scanTree passes to csv.reader
  csv.register_dialect('mailboxlist',mailboxlist)
  # Turn on automatic garbage collection before the backup starts
  gc.enable()
  main()

Initial URL
http://the.taoofmac.com/media/Projects/imapbackup/imapbackup.py.txt

Initial Description


Initial Title
IMAP Backup Script

Initial Tags
python

Initial Language
Python