Posted By

Twain on 10/28/07


Tagged

python ubuntu mirror


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

copyleft


Ubuntu Mirror


 / Published in: Python
 

Creates a specialised Ubuntu mirror for a single version, architecture and language - reducing the space required for the mirror from humongous to about 17 gigabytes. Such a local mirror is handy for Ubuntu machines that lack a nearby or cheap internet connection.

  1. #!/usr/bin/python
  2.  
  3. """ubuntu-mirror.py -- Make a personal mirror of a subset of Ubuntu or Debian
  4.  
  5. Usage: ubuntu-mirror.py [options]
  6.  
  7. First fetches Contents-arch.gz, Packages.bz2 and Release files, but
  8. only if they changed size. Then it parses Packages.bz2 files and fetches
  9. any non-excluded deb files which changed size (zero is a size :).
  10. It keeps a list of downloaded debs, and deletes any debs no longer
  11. listed on the remote side.
  12.  
  13. Options and example values (just examples, not defaults!)
  14.  
  15. --archs=i386,powerpc
  16. Comma-separated architectures to retrieve
  17.  
  18. --base=ftp://ftp.leg.uct.ac.za:/pub/linux/ubuntu
  19. Base URL for the distribution (the directory should contain dists
  20. and pool). Only supports http:// and anonymous ftp:// urls.
  21.  
  22. --components=main,restricted,universe,multiverse
  23. Comma-separated distribution components to retrieve.
  24.  
  25. --dest=/mirror/somewhere
  26. Destination directory for the mirror (this option has a default:
  27. the current directory)
  28.  
  29. --dists=feisty,feisty-security,feisty-updates
  30. Comma-separated distribution names to retrieve
  31.  
  32. --help
  33. Print this help message and exit.
  34.  
  35. --langs=en,ch,cz
  36. Comma-separated two-letter language endings to keep. Debs ending
  37. with other language codes are excluded from the mirror, as well as a
  38. regex to exclude kde/gnome language packs. If nothing is specified
  39. (no --langs option) all languages are kept. Any excluded packages
  40. remain in Packages.bz2, but will be 'missing' if apt tries to fetch
  41. the .deb.
  42.  
  43. --license
  44. Print the license terms for ubuntu-mirror.py (GPL) and exit.
  45.  
  46. --regenerate
  47. Regenerate the mirror listing (ignores all except --dest).
  48.  
  49. Example for Medibuntu:
  50.  
  51. > python ubuntu-mirror.py --dest=. \
  52. --archs=i386,powerpc \
  53. --base=http://medibuntu.sos-sts.com/repo \
  54. --dists=feisty --components=free,non-free
  55.  
  56. Example for ubuntu.mirror.ac.za:
  57.  
  58. > python ubuntu-mirror.py --dest=. \
  59. --langs=en --archs=powerpc \
  60. --base=ftp://ubuntu.mirror.ac.za/ubuntu-archive \
  61. --components=main,restricted,universe,multiverse \
  62. --dists=feisty,feisty-security,feisty-updates
  63.  
  64. If you edit the mirror regenerate the listing:
  65.  
  66. > python ubuntu-mirror.py --dest=. --regenerate
  67.  
  68. Copyright: 2007 G raham P oulter
  69. """
  70.  
  71. from __future__ import with_statement
  72.  
  73. __copyright__ = "2007 G raham P oulter"
  74. __author__ = "G raham P oulter"
  75. __license__ = """This program is free software: you can redistribute it and/or
  76. modify it under the terms of the GNU General Public License as published by the
  77. Free Software Foundation, either version 3 of the License, or (at your option)
  78. any later version.
  79.  
  80. This program is distributed in the hope that it will be useful, but WITHOUT ANY
  81. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
  82. PARTICULAR PURPOSE. See the GNU General Public License for more details.
  83.  
  84. You should have received a copy of the GNU General Public License along with
  85. this program. If not, see <http://www.gnu.org/licenses/>."""
  86.  
  87. from urllib import urlopen, quote
  88. from urlparse import urlparse, urljoin
  89. from contextlib import closing
  90. from getopt import getopt
  91.  
  92. import bz2
  93. import ftplib
  94. import gzip
  95. import os
  96. import re
  97. import sys
  98. import time
  99.  
  100.  
  101. language_list = ["-bg","-bn","-br","-bs","-by","-ca","-cn","-cs",
  102. "-cy","-da","-de","-dz","-el","-en","-eo","-er",
  103. "-es","-et","-fa","-fi","-fr","-ga","-gl","-he",
  104. "-hr","-hu","-in","-it","-ja","-ka","-kn","-ko",
  105. "-ku","-lo","-lt","-lv","-mk","-nb","-ne","-nl",
  106. "-nn","-nr","-ns","-pl","-pt","-ru","-rw","-sk",
  107. "-sl","-sr","-ss","-st","-sv","-sw","-tg","-th",
  108. "-tn","-tr","-ts","-tw","-uk","-ve","-vi","-xh",
  109. "-zu"]
  110. """Language endings so we can exclude packages based on language"""
  111.  
  112.  
  113. def log(msg, *argp):
  114. """Function for logging - may reimplement later"""
  115. print msg % argp
  116.  
  117.  
  118. #######################################################################
  119.  
  120. class UbuntuMirror:
  121. """Class to represent the status of an on-disk mirror of Ubuntu
  122.  
  123. @note: We assume that the mirror is rooted at the current directory, and
  124. all paths are relative to the current directory. This class cannot be used
  125. with anything else that wants to change the current directory.
  126.  
  127. @ivar listing: Relative path to the mirror listing, where each line is
  128. a tab-separated relative path and file size representing the mirror state.
  129.  
  130. @ivar present: Mapping from file name to size in bytes for files in the
  131. listing.
  132.  
  133. @ivar delete: If True, files are deleted from disk when removed from
  134. the mirror listing.
  135. """
  136.  
  137. def __init__(self, listing="listing", delete=True):
  138. """Initialise the mirror"""
  139. self.listing = listing
  140. self.present = {}
  141. self.delete = delete
  142. self._read_listing()
  143.  
  144.  
  145. def _read_listing(self):
  146. """Read filename->size mapping for files in the mirror"""
  147. log("Reading mirror listing")
  148. self.present = {}
  149. if os.path.exists(self.listing):
  150. with open(self.listing, "rb") as f:
  151. for line in f:
  152. name, size = line.split("\t",1)
  153. self.present[name] = int(size)
  154.  
  155.  
  156. def _backup_listing(self):
  157. """Move listing file to a backup file"""
  158. log("Backing up mirror listing")
  159. if os.path.exists(self.listing):
  160. os.rename(self.listing, self.listing+".bak")
  161.  
  162.  
  163. def write_listing(self):
  164. """Write the listing from memory to disk"""
  165. self._backup_listing()
  166. log("Writing mirror listing")
  167. with open(self.listing, "wb") as f:
  168. for name, size in sorted(self.present.iteritems()):
  169. f.write(name + "\t" + str(size) + "\n")
  170.  
  171.  
  172. def regenerate(self):
  173. """Regenerate the listing file by recursing the mirror"""
  174. self._backup_listing()
  175. log("Regenerating mirror listing recursively")
  176. self.present = {}
  177. with open(self.listing, "wb") as f:
  178. for toplevel in [ "dists", "pool" ]:
  179. for root, dirnames, fnames in os.walk(toplevel, topdown=True):
  180. dirnames.sort()
  181. fnames.sort()
  182. for fname in fnames:
  183. fpath = os.path.join(root, fname)
  184. size = os.stat(fpath).st_size
  185. fline = "%s\t%d\n" % (fpath, size)
  186. self.add(fpath, size)
  187. sys.stdout.write(fline)
  188. f.write(fline)
  189.  
  190.  
  191. def add(self, fpath, size=None):
  192. """Add relative file path and size to the mirror listing"""
  193. if size is None:
  194. size = os.stat(fpath).st_size
  195. self.present[fpath] = size
  196.  
  197.  
  198. def remove(self, fname):
  199. """Remove a file from the mirror (no-op if already removed)"""
  200. try:
  201. del self.present[fname]
  202. except KeyError:
  203. pass
  204. if self.delete:
  205. log("Deleting %s", fname)
  206. else:
  207. log("Pretending to delete %s", fname)
  208. return # Skip the actual deletion
  209. try:
  210. if os.path.exists(fname):
  211. os.remove(fname)
  212. try:
  213. os.removedirs(os.path.dirname(fname))
  214. except OSError:
  215. pass # Don't care if rmdir failed
  216. except OSError:
  217. log("Failed to delete %s", fname)
  218.  
  219.  
  220. def prune(self, keep):
  221. """Delete files that are not members of keep"""
  222. for fname in sorted(self.present.iterkeys()):
  223. if fname not in keep:
  224. self.remove(fname)
  225.  
  226.  
  227. #######################################################################
  228.  
  229.  
  230. def download_files(baseurl, flist, mirror):
  231. """Retrieve files via FTP or HTTP, but only fetch if size has changed and
  232. only gets the local or remote size if we do not already know it. Returns
  233. list of successful retrievals
  234.  
  235. @param baseurl: URL to which the files in flist are relative
  236.  
  237. @param flist: List (filename, size) of relative paths to retrieve
  238. (use None in second half when sizes are not known).
  239.  
  240. @param mirror: L{UbuntuMirror} instance for current state of mirror.
  241.  
  242. @return: List of (filename, size) pairs for downloaded files."""
  243. if not baseurl.endswith("/"):
  244. baseurl += "/"
  245. proto, host, basedir = urlparse(baseurl)[:3]
  246. if proto not in ["ftp", "http"]:
  247. log("Error: Only ftp:// and http:// urls are supported")
  248. sys.exit(1)
  249. def get_file(fname, rsize):
  250. """Retrieve file via FTP or HTTP
  251. @param fname: Relative path to remote file
  252. @param rsize: Size of remote file, or None (to query the size)
  253. """
  254. # Create directory if necessary
  255. if proto == "http":
  256. url = None
  257. # Only fetch if new size is different
  258. if fname in mirror.present:
  259. lsize = mirror.present[fname]
  260. try:
  261. if rsize is None:
  262. if proto == "ftp":
  263. rsize = int(ftp.size(fname))
  264. elif proto == "http":
  265. url = urlopen(urljoin(baseurl, quote(fname)))
  266. rsize = int(url.info()['content-length'])
  267. except (IOError, ftplib.all_errors), e:
  268. log("Failed to get size of remote file %s (%s)", fname, str(e))
  269. return False
  270. if lsize == rsize:
  271. log("Already retrieved %s", fname)
  272. return False
  273. else:
  274. log("Size has changed (%d to %d) for %s", lsize, rsize, fname)
  275. else:
  276. dname = os.path.dirname(fname)
  277. if not os.path.isdir(dname):
  278. os.makedirs(dname)
  279. # Fetch file
  280. log("Retrieving %s", fname)
  281. outfile = open(fname, "wb")
  282. t0 = time.time()
  283. try:
  284. sys.stdout.flush()
  285. if proto == "ftp":
  286. ftp.retrbinary("RETR %s" % fname, outfile.write)
  287. elif proto == "http":
  288. if url is None:
  289. url = urlopen(urljoin(baseurl, quote(fname)))
  290. rsize = int(url.info()['content-length'])
  291. outfile.write(url.read(rsize))
  292. t1 = time.time()
  293. size = outfile.tell()
  294. except (IOError, ftplib.all_errors), e:
  295. log("Error: failed to retrieve %s (%s)" , fname, str(e))
  296. return False
  297. finally:
  298. outfile.close()
  299. # Calculate speed
  300. speed = 0
  301. if (t1-t0) > 0:
  302. speed = int((size/1000)/(t1-t0))
  303. log("Wrote %d bytes in %.2f seconds (%d KB/s)", size, t1-t0, speed)
  304. # Check download size
  305. if rsize is not None and size != rsize:
  306. log("Error: retrieved %d bytes, but size should be %d", size, rsize)
  307. return False
  308. # All good, add the file to the mirror
  309. mirror.add(fname)
  310. return True
  311. # Log into the host
  312. if proto == "ftp":
  313. log("FTP logging in to %s to transfer %d files", host, len(flist))
  314. ftp = ftplib.FTP(host)
  315. ftp.login()
  316. ftp.cwd(basedir)
  317. elif proto == "http":
  318. log("Beginning HTTP transfer of %d files from %s", len(flist), baseurl)
  319. # Loop over the files
  320. success = []
  321. for fname, rsize in flist:
  322. if get_file(fname, rsize):
  323. success.append((fname, rsize))
  324. if proto == "ftp":
  325. ftp.quit()
  326. return success
  327.  
  328.  
  329.  
  330. def list_packages(archs, dists, components, packages_ext="bz2"):
  331. """Return a list of Release and Packages files for architectures
  332. in archs, distributions in dists and the given list of components.
  333.  
  334. These files can be fetched using download_files, and the Packages.bz2
  335. files can then be parsed with parse_packages.
  336. """
  337. flist = []
  338. def add(fname):
  339. flist.append((fname,None))
  340. for arch in archs:
  341. for dist in dists:
  342. dist_dir = "dists/%(dist)s" % locals()
  343. add("%(dist_dir)s/Release" % locals())
  344. add("%(dist_dir)s/Contents-%(arch)s.gz" % locals())
  345. for component in components:
  346. arch_dir = "%(dist_dir)s/%(component)s/binary-%(arch)s" % locals()
  347. add("%(arch_dir)s/Packages.%(packages_ext)s" % locals())
  348. add("%(arch_dir)s/Release" % locals())
  349. return flist
  350.  
  351.  
  352.  
  353. def parse_packages(packages_files):
  354. """Parse the list of pool/x/y/z.deb files and list of sizes.
  355.  
  356. @param packages_files: A list of relative paths. Those whose
  357. basename is Packages.bz2 are parsed for 'Filename: ' and 'Size: '
  358. lines.
  359.  
  360. @return: List of debian packaages, and a list of their sizes.
  361. """
  362. fnames, fsizes = [], []
  363. fn_str = "Filename: "
  364. fn_len = len(fn_str)
  365. sz_str = "Size: "
  366. sz_len = len(sz_str)
  367. def parse_file(f):
  368. for line in f:
  369. # Trust "Filename:" to always have corresponding "Size:"
  370. if line.startswith(fn_str):
  371. fnames.append(line[fn_len:-1])
  372. elif line.startswith(sz_str):
  373. fsizes.append(int(line[sz_len:-1]))
  374.  
  375. for lname in packages_files:
  376. # Decide whether we're parsing .bz2 or .gz
  377. if os.path.basename(lname) == "Packages.bz2":
  378. log("Parsing: %s", lname)
  379. with closing(bz2.BZ2File(lname,"r")) as f:
  380. parse_file(f)
  381. elif os.path.basename(lname) == "Packages.gz":
  382. with closing(gzip.GzipFile(lname,"r")) as f:
  383. parse_file(f)
  384. return zip(fnames, fsizes)
  385.  
  386.  
  387.  
  388. def exclude_debs(debfiles, langs, pkgregex):
  389. """Filter a list of debian packages using some exclusion rules
  390.  
  391. @param debfiles: Pairs of
  392. pool/component/letter/package/name_version_arch.deb and
  393. file sizes.
  394.  
  395. @param langs: Set of language codes of the form '-de' to exclude
  396.  
  397. @param pkgregex: Regular expression - exclude matching package directories
  398.  
  399. @return: Pairs of (name, size) for those that made it through the filter
  400. """
  401. total_pkgs = 0
  402. total_size = 0
  403. for fname, fsize in debfiles:
  404. # Bits from pool/component/letter/package/name_version_arch.deb
  405. bits = fname.rsplit("/",4)
  406. langcode = bits[4].split("_",1)[0][-3:]
  407. pkgdir = bits[3]
  408. # Check last 3 letter of name against language
  409. if langcode in langs:
  410. log("Excluding %s (%s bytes) [langcode]", fname, fsize)
  411. total_pkgs += 1
  412. total_size += fsize
  413. elif pkgregex.match(pkgdir):
  414. log("Excluding %s (%s bytes) [pkgregex]", fname, fsize)
  415. total_pkgs += 1
  416. total_size += fsize
  417. else:
  418. yield (fname, fsize)
  419. log("Excluded %d packages, for %d bytes", total_pkgs, total_size)
  420.  
  421.  
  422.  
  423. #######################################################################
  424.  
  425. def scriptmain(args):
  426. """Main function for the script, call with contents of sys.argv"""
  427. # Set up default options
  428. archs = []
  429. baseurl = []
  430. destdir = "."
  431. dists = []
  432. components = []
  433. exclude_file = ""
  434. langs = []
  435. regenerate = False
  436. # Parse command options
  437. optlist, args = getopt(args[1:], "",
  438. ["archs=", "base=", "components=", "dest=", "dists=",
  439. "exclude=", "help", "langs=", "license", "regenerate"])
  440. # Split on commas, but avoids [""]
  441. def commasplit(string):
  442. return [] if string == "" else string.split(",")
  443. for key, value in optlist:
  444. if key == "--archs":
  445. archs = commasplit(value)
  446. elif key == "--base":
  447. baseurl = value
  448. elif key == "--components":
  449. components = commasplit(value)
  450. elif key == "--dest":
  451. destdir = value
  452. elif key == "--dists":
  453. dists = commasplit(value)
  454. elif key == "--exclude":
  455. exclude_file = value
  456. elif key == "--help":
  457. print __doc__
  458. sys.exit(0)
  459. elif key == "--langs":
  460. langs = commasplit(value)
  461. for lang in langs:
  462. if not lang.isalpha() or len(lang) != 2:
  463. log("Invalid language code '%s', exiting", lang)
  464. sys.exit(1)
  465. elif key == "--license":
  466. print license
  467. sys.exit(0)
  468. elif key == "--regenerate":
  469. regenerate = True
  470.  
  471. # Change to destination directory and read mirror
  472. os.chdir(destdir)
  473. mirror = UbuntuMirror()
  474.  
  475. # Perform regeneration instead of other actions
  476. if regenerate:
  477. mirror.regenerate()
  478. sys.exit(0)
  479.  
  480. # Check that required arguments are provided
  481. if not archs:
  482. log("Please provide at least one architecture (--archs)!")
  483. sys.exit(1)
  484. if not baseurl:
  485. log("Please provide a base URL (--base)!")
  486. sys.exit(1)
  487. if not components:
  488. log("Please provide at least one component (--components)!")
  489. sys.exit(1)
  490. if not dists:
  491. log("Please provie at least one distribution (--dists)!")
  492. sys.exit(1)
  493.  
  494. # Exclude language codes
  495. if langs:
  496. exclude_langs = set(language_list) - set(["-"+x for x in langs])
  497. else:
  498. exclude_langs = set()
  499. # Exclude language packs
  500. if langs:
  501. exclude_lang_packs = re.compile(
  502. r"^language-pack(?:-kde|-gnome|)-(?!%s)[a-z][a-z](?:-base|)$"
  503. % "|".join(langs))
  504. else:
  505. exclude_lang_packs = re.compile(r"^$")
  506.  
  507. # Get the distribution packages files
  508. log("Creating list of required packages files")
  509. package_files = list_packages(archs, dists, components, packages_ext="bz2")
  510. log("\nDownloading required packages files")
  511. download_files(baseurl, package_files, mirror)
  512. # Parse names of current debfiles from the packages files
  513. log("\nParsing packages files for debs")
  514. debfiles = parse_packages(x[0] for x in package_files)
  515. # Exclusions based on package name or language ending
  516. log("\nFiltering debs for exclusions")
  517. debfiles = list(exclude_debs(debfiles, exclude_langs, exclude_lang_packs))
  518. # Download the remaining .deb files
  519. log("\nDownloading filtered debs (only size has changed)")
  520. fetched = download_files(baseurl, debfiles, mirror)
  521. # Update the mirror
  522. log("\nRemoving files which are no longer on mirror")
  523. mirror.prune(set([x[0] for x in package_files] + [x[0] for x in debfiles]))
  524. mirror.write_listing()
  525.  
  526. if __name__ == "__main__":
  527. scriptmain(sys.argv)

Report this snippet  

You need to login to post a comment.