Published in: Python
Creates a specialised Ubuntu mirror for a single version, architecture and language, reducing the space required for the mirror from humongous to about 17 gigabytes. Such a local mirror is handy for Ubuntu machines that lack a nearby or cheap internet connection.
#!/usr/bin/python
"""ubuntu-mirror.py -- Make a personal mirror of a subset of Ubuntu or Debian

Usage: ubuntu-mirror.py [options]

First fetches Contents-arch.gz, Packages.bz2 and Release files, but only
if they changed size.  Then it parses Packages.bz2 files and fetches any
non-excluded deb files which changed size (zero is a size :).  It keeps a
list of downloaded debs, and deletes any debs no longer listed on the
remote side.

Options and example values (just examples, not defaults!)

  --archs=i386,powerpc
      Comma-separated architectures to retrieve

  --base=ftp://ftp.leg.uct.ac.za:/pub/linux/ubuntu
      Base URL for the distribution (the directory should contain dists
      and pool).  Only supports http://, https:// and anonymous ftp://
      urls.

  --components=main,restricted,universe,multiverse
      Comma-separated distribution components to retrieve.

  --dest=/mirror/somewhere
      Destination directory for the mirror (this option has a default:
      the current directory)

  --dists=feisty,feisty-security,feisty-updates
      Comma-separated distribution names to retrieve

  --help
      Print this help message and exit.

  --langs=en,ch,cz
      Comma-separated two-letter language endings to keep.  Debs ending
      with other language codes are excluded from the mirror, as well as
      a regex to exclude kde/gnome language packs.  If nothing is
      specified (no --langs option) all languages are kept.  Any excluded
      packages remain in Packages.bz2, but will be 'missing' if apt tries
      to fetch the .deb.

  --license
      Print the license terms for ubuntu-mirror.py (GPL) and exit.

  --regenerate
      Regenerate the mirror listing (ignores all except --dest).

Example for Medibuntu:

  > python ubuntu-mirror.py --dest=. \\
      --archs=i386,powerpc \\
      --base=http://medibuntu.sos-sts.com/repo \\
      --dists=feisty --components=free,non-free

Example for ubuntu.mirror.ac.za:

  > python ubuntu-mirror.py --dest=. \\
      --langs=en --archs=powerpc \\
      --base=ftp://ubuntu.mirror.ac.za/ubuntu-archive \\
      --components=main,restricted,universe,multiverse \\
      --dists=feisty,feisty-security,feisty-updates

If you edit the mirror regenerate the listing:

  > python ubuntu-mirror.py --dest=. --regenerate

Copyright: 2007 G raham P oulter
"""

__copyright__ = "2007 G raham P oulter"
__author__ = "G raham P oulter"
__license__ = """This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>."""

import bz2
import ftplib
import gzip
import os
import re
import shutil
import sys
import time
from contextlib import closing
from getopt import GetoptError, getopt

try:  # Python 3
    from http.client import HTTPException
    from urllib.parse import quote, urljoin, urlparse
    from urllib.request import urlopen
except ImportError:  # Python 2.6 / 2.7
    from httplib import HTTPException
    from urllib import quote
    # urllib2.urlopen raises on HTTP errors; urllib.urlopen would hand back
    # the 404 page, which would then be saved as if it were the file.
    from urllib2 import urlopen
    from urlparse import urljoin, urlparse

# Language endings so we can exclude packages based on language
language_list = ["-bg", "-bn", "-br", "-bs", "-by", "-ca", "-cn", "-cs",
                 "-cy", "-da", "-de", "-dz", "-el", "-en", "-eo", "-er",
                 "-es", "-et", "-fa", "-fi", "-fr", "-ga", "-gl", "-he",
                 "-hr", "-hu", "-in", "-it", "-ja", "-ka", "-kn", "-ko",
                 "-ku", "-lo", "-lt", "-lv", "-mk", "-nb", "-ne", "-nl",
                 "-nn", "-nr", "-ns", "-pl", "-pt", "-ru", "-rw", "-sk",
                 "-sl", "-sr", "-ss", "-st", "-sv", "-sw", "-tg", "-th",
                 "-tn", "-tr", "-ts", "-tw", "-uk", "-ve", "-vi", "-xh",
                 "-zu"]

# Flat tuple of the errors a transfer can raise.  ftplib.all_errors is
# itself a tuple, and Python 3 rejects nested tuples in an except clause.
_NET_ERRORS = (IOError, HTTPException) + tuple(ftplib.all_errors)

_FILENAME_FIELD = b"Filename: "
_SIZE_FIELD = b"Size: "


def log(msg, *argp):
    """Function for logging - may reimplement later

    @param msg: %-style format string.
    @param argp: Values substituted into msg.
    """
    print(msg % argp)


#######################################################################

class UbuntuMirror(object):
    """Class to represent the status of an on-disk mirror of Ubuntu

    @note: We assume that the mirror is rooted at the current directory,
    and all paths are relative to the current directory.  This class cannot
    be used with anything else that wants to change the current directory.

    @ivar listing: Relative path to the mirror listing, where each line is
    a tab-separated relative path and file size representing the mirror
    state.

    @ivar present: Mapping from file name to size in bytes for files in
    the listing.

    @ivar delete: If True, files are deleted from disk when removed from
    the mirror listing.
    """

    def __init__(self, listing="listing", delete=True):
        """Initialise the mirror and load the listing file if it exists"""
        self.listing = listing
        self.present = {}
        self.delete = delete
        self._read_listing()

    def _read_listing(self):
        """Read filename->size mapping for files in the mirror"""
        log("Reading mirror listing")
        self.present = {}
        if not os.path.exists(self.listing):
            return
        with open(self.listing, "r") as f:
            for line in f:
                line = line.rstrip("\r\n")
                if not line:
                    continue  # Tolerate blank lines from hand edits
                name, size = line.split("\t", 1)
                self.present[name] = int(size)

    def _backup_listing(self):
        """Move listing file to a backup file"""
        log("Backing up mirror listing")
        if os.path.exists(self.listing):
            os.rename(self.listing, self.listing + ".bak")

    def write_listing(self):
        """Write the listing from memory to disk"""
        self._backup_listing()
        log("Writing mirror listing")
        with open(self.listing, "w") as f:
            for name, size in sorted(self.present.items()):
                f.write("%s\t%d\n" % (name, size))

    def regenerate(self):
        """Regenerate the listing file by recursing the mirror

        Walks the dists and pool directories below the current directory,
        echoing every listing line to standard output as it is written.
        """
        self._backup_listing()
        log("Regenerating mirror listing recursively")
        self.present = {}
        with open(self.listing, "w") as f:
            for toplevel in ["dists", "pool"]:
                for root, dirnames, fnames in os.walk(toplevel, topdown=True):
                    # Sorting dirnames in place makes os.walk descend in
                    # sorted order, so the listing is deterministic.
                    dirnames.sort()
                    fnames.sort()
                    for fname in fnames:
                        fpath = os.path.join(root, fname)
                        size = os.stat(fpath).st_size
                        fline = "%s\t%d\n" % (fpath, size)
                        self.add(fpath, size)
                        sys.stdout.write(fline)
                        f.write(fline)

    def add(self, fpath, size=None):
        """Add relative file path and size to the mirror listing

        @param fpath: Path of the file relative to the mirror root.
        @param size: Size in bytes; the file is stat'ed when this is None.
        """
        if size is None:
            size = os.stat(fpath).st_size
        self.present[fpath] = size

    def remove(self, fname):
        """Remove a file from the mirror (no-op if already removed)

        The file is always dropped from the in-memory listing; it is only
        deleted from disk when self.delete is True.
        """
        self.present.pop(fname, None)
        if not self.delete:
            log("Pretending to delete %s", fname)
            return  # Skip the actual deletion
        log("Deleting %s", fname)
        try:
            if os.path.exists(fname):
                os.remove(fname)
                try:
                    # Also drop parent directories that became empty
                    os.removedirs(os.path.dirname(fname))
                except OSError:
                    pass  # Don't care if rmdir failed
        except OSError:
            log("Failed to delete %s", fname)

    def prune(self, keep):
        """Delete files that are not members of keep

        @param keep: Container of relative paths that must stay.
        """
        # sorted() copies the keys, so remove() may mutate self.present
        for fname in sorted(self.present):
            if fname not in keep:
                self.remove(fname)


#######################################################################

def _content_length(response):
    """Return the Content-Length of an open HTTP response, or None"""
    try:
        return int(response.info().get("content-length"))
    except (TypeError, ValueError):
        return None  # Header missing or not a number


def _discard(path):
    """Delete path, ignoring the error if that is not possible"""
    try:
        os.remove(path)
    except OSError:
        pass


def _replace(src, dst):
    """Move src over dst, replacing an existing dst"""
    if os.name == "nt" and os.path.exists(dst):
        os.remove(dst)  # rename() cannot overwrite on Windows
    os.rename(src, dst)


def download_files(baseurl, flist, mirror):
    """Retrieve files via FTP or HTTP(S), but only fetch if size has changed
    and only gets the local or remote size if we do not already know it.

    Each file is downloaded to "<name>.part" and moved into place only
    after the transfer succeeded and the size matched, so a failed
    download never damages the copy already in the mirror.

    @param baseurl: URL to which the files in flist are relative

    @param flist: List (filename, size) of relative paths to retrieve (use
    None in second half when sizes are not known).

    @param mirror: L{UbuntuMirror} instance for current state of mirror.

    @return: List of (filename, size) pairs for downloaded files, with the
    sizes exactly as they were given in flist.
    """
    if not baseurl.endswith("/"):
        baseurl += "/"
    parsed = urlparse(baseurl)
    proto, basedir = parsed.scheme, parsed.path
    if proto not in ("ftp", "http", "https"):
        log("Error: Only ftp://, http:// and https:// urls are supported")
        sys.exit(1)
    is_ftp = proto == "ftp"
    # hostname/port rather than netloc: netloc keeps any ":port" suffix
    # (even an empty one), which ftplib cannot resolve as a host name.
    host, port = parsed.hostname, parsed.port
    ftp = None

    def get_file(fname, rsize):
        """Retrieve one file; return True if it was (re)downloaded

        @param fname: Relative path to remote file
        @param rsize: Size of remote file, or None (to query the size)
        """
        response = None  # Open HTTP response, reused for the download
        try:
            # Only fetch if new size is different
            if fname in mirror.present:
                lsize = mirror.present[fname]
                if rsize is None:
                    try:
                        if is_ftp:
                            rsize = ftp.size(fname)
                        else:
                            response = urlopen(urljoin(baseurl, quote(fname)))
                            rsize = _content_length(response)
                    except _NET_ERRORS as e:
                        log("Failed to get size of remote file %s (%s)",
                            fname, str(e))
                        return False
                    if rsize is None:
                        log("Failed to get size of remote file %s (%s)",
                            fname, "server did not report a size")
                        return False
                    rsize = int(rsize)
                if lsize == rsize:
                    log("Already retrieved %s", fname)
                    return False
                log("Size has changed (%d to %d) for %s", lsize, rsize, fname)

            # Create directory if necessary (it may have been removed from
            # disk even though the file is still in the listing)
            dname = os.path.dirname(fname)
            if dname and not os.path.isdir(dname):
                os.makedirs(dname)

            # Fetch file
            log("Retrieving %s", fname)
            sys.stdout.flush()
            partial = fname + ".part"
            t0 = time.time()
            try:
                with open(partial, "wb") as outfile:
                    if is_ftp:
                        ftp.retrbinary("RETR %s" % fname, outfile.write)
                    else:
                        if response is None:
                            response = urlopen(urljoin(baseurl, quote(fname)))
                        if rsize is None:
                            rsize = _content_length(response)
                        # Stream in chunks instead of holding the whole
                        # file in memory
                        shutil.copyfileobj(response, outfile)
                    size = outfile.tell()
            except _NET_ERRORS as e:
                log("Error: failed to retrieve %s (%s)", fname, str(e))
                _discard(partial)
                return False
            elapsed = time.time() - t0

            # Calculate speed
            speed = 0
            if elapsed > 0:
                speed = int(size / 1000.0 / elapsed)
            log("Wrote %d bytes in %.2f seconds (%d KB/s)",
                size, elapsed, speed)

            # Check download size
            if rsize is not None and size != rsize:
                log("Error: retrieved %d bytes, but size should be %d",
                    size, rsize)
                _discard(partial)
                return False

            # All good, move into place and add the file to the mirror
            _replace(partial, fname)
            mirror.add(fname, size)
            return True
        finally:
            if response is not None:
                response.close()

    # Log into the host
    if is_ftp:
        log("FTP logging in to %s to transfer %d files", host, len(flist))
        ftp = ftplib.FTP()
        ftp.connect(host, port or 0)  # Port 0 selects the FTP default
        ftp.login()
        ftp.cwd(basedir)
    else:
        log("Beginning HTTP transfer of %d files from %s",
            len(flist), baseurl)

    # Loop over the files
    success = []
    try:
        for fname, rsize in flist:
            if get_file(fname, rsize):
                success.append((fname, rsize))
    finally:
        if ftp is not None:
            try:
                ftp.quit()
            except _NET_ERRORS:
                ftp.close()  # The server is gone; just drop the socket
    return success


def list_packages(archs, dists, components, packages_ext="bz2"):
    """Return a list of Release, Contents and Packages files for
    architectures in archs, distributions in dists and the given list of
    components.

    These files can be fetched using download_files, and the Packages
    files can then be parsed with parse_packages.

    @param archs: Architecture names, such as "i386".
    @param dists: Distribution names, such as "feisty".
    @param components: Component names, such as "main".
    @param packages_ext: Extension of the Packages index to request.

    @return: List of (relative path, None) pairs without duplicates, in
    the order they were first generated.
    """
    flist = []
    seen = set()

    def add(fname):
        # dists/<dist>/Release is shared by all architectures; list it once
        if fname not in seen:
            seen.add(fname)
            flist.append((fname, None))

    for arch in archs:
        for dist in dists:
            dist_dir = "dists/%s" % dist
            add("%s/Release" % dist_dir)
            add("%s/Contents-%s.gz" % (dist_dir, arch))
            for component in components:
                arch_dir = "%s/%s/binary-%s" % (dist_dir, component, arch)
                add("%s/Packages.%s" % (arch_dir, packages_ext))
                add("%s/Release" % arch_dir)
    return flist


def _native(raw):
    """Return the byte string raw as the interpreter's native str type"""
    if isinstance(raw, str):
        return raw  # Python 2: byte strings are already str
    return raw.decode("utf-8")


def _parse_stanzas(lines):
    """Yield (filename, size) for each stanza of a Packages file

    @param lines: Iterable of byte-string lines.  Stanzas are separated by
    blank lines; a stanza lacking either field is skipped.
    """
    name = size = None
    for raw in lines:
        line = raw.rstrip(b"\r\n")
        if not line.strip():
            if name is not None and size is not None:
                yield (name, size)
            name = size = None
        elif line.startswith(_FILENAME_FIELD):
            name = _native(line[len(_FILENAME_FIELD):])
        elif line.startswith(_SIZE_FIELD):
            size = int(_native(line[len(_SIZE_FIELD):]))
    # The last stanza need not be followed by a blank line
    if name is not None and size is not None:
        yield (name, size)


def parse_packages(packages_files):
    """Parse the list of pool/x/y/z.deb files and their sizes.

    @param packages_files: Iterable of relative paths.  Those whose
    basename is Packages.bz2 or Packages.gz are parsed for 'Filename: '
    and 'Size: ' lines; all other paths are ignored.

    @return: List of (deb path, size in bytes) pairs.

    @note: A listed Packages file that cannot be opened raises instead of
    being skipped: with an incomplete package list the later prune step
    would delete debs that are still wanted.
    """
    openers = {"Packages.bz2": bz2.BZ2File, "Packages.gz": gzip.GzipFile}
    debs = []
    for lname in packages_files:
        # Decide whether we're parsing .bz2 or .gz
        opener = openers.get(os.path.basename(lname))
        if opener is None:
            continue
        log("Parsing: %s", lname)
        with closing(opener(lname, "r")) as f:
            debs.extend(_parse_stanzas(f))
    return debs


def exclude_debs(debfiles, langs, pkgregex):
    """Filter a list of debian packages using some exclusion rules

    @param debfiles: Pairs of
    pool/component/letter/package/name_version_arch.deb and file sizes.

    @param langs: Set of language codes of the form '-de' to exclude

    @param pkgregex: Regular expression - exclude matching package
    directories

    @return: Generator over the (name, size) pairs that made it through
    the filter; a summary is logged once it is exhausted.
    """
    total_pkgs = 0
    total_size = 0
    for fname, fsize in debfiles:
        # Bits from pool/component/letter/package/name_version_arch.deb
        parts = fname.split("/")
        pkgdir = parts[-2] if len(parts) > 1 else ""
        # Last 3 letters of the package name, e.g. "-de" for foo-de_1_all.deb
        langcode = parts[-1].split("_", 1)[0][-3:]
        if langcode in langs:
            reason = "langcode"
        elif pkgregex.match(pkgdir):
            reason = "pkgregex"
        else:
            yield (fname, fsize)
            continue
        log("Excluding %s (%s bytes) [" + reason + "]", fname, fsize)
        total_pkgs += 1
        total_size += fsize
    log("Excluded %d packages, for %d bytes", total_pkgs, total_size)


#######################################################################

def scriptmain(args):
    """Main function for the script, call with contents of sys.argv

    @param args: Full argument vector; args[0] (the program name) is
    ignored.
    """
    # Set up default options
    archs = []
    baseurl = ""
    destdir = "."
    dists = []
    components = []
    langs = []
    regenerate = False

    # Parse command options
    try:
        optlist, _ = getopt(args[1:], "", [
            "archs=", "base=", "components=", "dest=", "dists=",
            "exclude=", "help", "langs=", "license", "regenerate"])
    except GetoptError as e:
        log("Error: %s (use --help for usage)", str(e))
        sys.exit(2)

    # Split on commas, but avoids [""]
    def commasplit(value):
        return value.split(",") if value else []

    for key, value in optlist:
        if key == "--archs":
            archs = commasplit(value)
        elif key == "--base":
            baseurl = value
        elif key == "--components":
            components = commasplit(value)
        elif key == "--dest":
            destdir = value
        elif key == "--dists":
            dists = commasplit(value)
        elif key == "--exclude":
            # Accepted so existing command lines keep working, but no
            # code in this script makes use of the value.
            pass
        elif key == "--help":
            print(__doc__)
            sys.exit(0)
        elif key == "--langs":
            langs = commasplit(value)
            for lang in langs:
                if not lang.isalpha() or len(lang) != 2:
                    log("Invalid language code '%s', exiting", lang)
                    sys.exit(1)
        elif key == "--license":
            # __license__, not the interpreter's built-in "license" object
            print(__license__)
            sys.exit(0)
        elif key == "--regenerate":
            regenerate = True

    # Change to destination directory and read mirror
    os.chdir(destdir)
    mirror = UbuntuMirror()

    # Perform regeneration instead of other actions
    if regenerate:
        mirror.regenerate()
        sys.exit(0)

    # Check that required arguments are provided
    if not archs:
        log("Please provide at least one architecture (--archs)!")
        sys.exit(1)
    if not baseurl:
        log("Please provide a base URL (--base)!")
        sys.exit(1)
    if not components:
        log("Please provide at least one component (--components)!")
        sys.exit(1)
    if not dists:
        log("Please provide at least one distribution (--dists)!")
        sys.exit(1)

    if langs:
        # Exclude every known language ending except the requested ones
        exclude_langs = set(language_list) - set("-" + x for x in langs)
        # Exclude language packs for any other two-letter language
        exclude_lang_packs = re.compile(
            r"^language-pack(?:-kde|-gnome|)-(?!%s)[a-z][a-z](?:-base|)$"
            % "|".join(langs))
    else:
        exclude_langs = set()
        exclude_lang_packs = re.compile(r"^$")

    # Get the distribution packages files
    log("Creating list of required packages files")
    package_files = list_packages(archs, dists, components,
                                  packages_ext="bz2")
    log("\nDownloading required packages files")
    download_files(baseurl, package_files, mirror)

    # Parse names of current debfiles from the packages files
    log("\nParsing packages files for debs")
    debfiles = parse_packages(x[0] for x in package_files)

    # Exclusions based on package name or language ending
    log("\nFiltering debs for exclusions")
    debfiles = list(exclude_debs(debfiles, exclude_langs, exclude_lang_packs))

    # Download the remaining .deb files
    log("\nDownloading filtered debs (only size has changed)")
    download_files(baseurl, debfiles, mirror)

    # Update the mirror
    log("\nRemoving files which are no longer on mirror")
    mirror.prune(set([x[0] for x in package_files] +
                     [x[0] for x in debfiles]))
    mirror.write_listing()


if __name__ == "__main__":
    scriptmain(sys.argv)
You need to login to post a comment.
