from __future__ import with_statement """ datastore.py Datastore is similar to the regular Django cache, except that it keeps the last update to the cache in a second, file-based location, in order to protect against failures updating the cache. For example, when using a remote datasource that may fail, the datastore will always keep a second copy of the last good update. If the remote source fails on the next update (i.e. an RSS feed is unavailable or there is a network failure), the backup copy is returned instead. The datastore works with the Django cache. The default cached copy is stored in the cache. The file-based backup protects against even a cache error. One difference in semantics between datastore and cache is that datastore has only one public function - get - which accepts a key, a function reference to be called to generate a new copy if the cached one is not available or is stale, and a length of time (in seconds) to pass to cache.set. Example: data = datastore.get('foo', get_more_foo, 5*60) Here is the work-flow: 1. client calls datastore.get('foo', get_more_foo, 5*60) 2. cache is checked for data keyed 'foo' 3. if cache returns None, get_more_foo is called to create a new object 4. if get_more_foo returns successfully, the new object is stored in the cache for 5*60 seconds and backed up to disk (using pickle), and the new object is returned 5. if get_more_foo raises an exception, the backup copy is returned, and is stored in the cache for half the default time ((5*60)/2) 6. if there is no backup copy (e.g. the first time datastore.get is called, there is a failure), only then is the exception raised by get_more_foo propagated. Three settings are required (in your settings.py file): DATASTORE_DIR = where to store backups (/tmp is a good location) DATASTORE_CULL_TIME = backups deleted after this many seconds of disuse (checked against atime) DATASTORE_CULL_AFTER = number of datastore.get calls between culls (deleting old backups) """ import os, tempfile, datetime, threading try: import cPickle as pickle except ImportError: import pickle from django.core.cache import cache from django.conf import settings # These values MUST be set in settings.py # DATASTORE_DIR = path to datastore directory # DATASTORE_CULL_TIME = after n seconds of no access, cull backup file # DATASTORE_CULL_AFTER = after n get calls, cull DATASTORE_DIR assert os.path.isdir(settings.DATASTORE_DIR), 'Cannot access settings.DATASTORE_DIR' assert isinstance(settings.DATASTORE_CULL_TIME, int), 'Cannot access settings.DATASTORE_CULL_TIME' assert isinstance(settings.DATASTORE_CULL_AFTER, int), 'Cannot access settings.DATASTORE_CULL_AFTER' EXT = 'ds_pkl' LOCK_FILE_PATH = '%s/ds_pkl.lock' % settings.DATASTORE_DIR class Counter(object): """This counter isn't great. Django runs as multiple apache processes which cannot communicate, so we have a race to cull when multiple counters hit DATASTORE_CULL_AFTER, but hell, nothing's perfect. This class just makes sure the count is thread-safe. """ def __init__(self, start=0): self.count = start self.lock = threading.RLock() def inc(self, n=1): with self.lock: self.count += n def dec(self, n=1): with self.lock: self.count -= n def reset(self, n=0): with self.lock: self.count = n def get_count(self): with self.lock: return self.count class LockFile(object): """Simple file lock with the same semantics as a mutex. Not recursively safe. """ def __init__(self, path): self.has_lock = False self.path = path def __enter__(self): self.acquire() def __exit__(self, etyp, einst, etb): self.release() def locked(self): """Predicates that the lock file exists. """ return os.path.isfile(self.path) def valid(self, seconds): """Predicates that the lock file exists and is no older (mtime) than seconds. """ return self.locked() and (time.time() - os.path.getmtime(self.path)) <= seconds def delete_lock_file(self): """Removes the lock file. Does not check for lock file's existence. """ os.unlink(self.path) def write_lock_file(self): """Writes the lock file. Does not check for lock file's existence. """ with open(self.path, 'w') as f: f.write('@@@') def acquire(self): """Acquires (creates) the lock file. Blocks until lock available. """ while self.locked(): time.sleep(0.01) self.write_lock_file() self.has_lock = True def release(self): """Releases (deletes) the lock file. """ if self.has_lock: self.delete_lock_file() self.has_lock = False def cull(): """Deletes old data store files from DATASTORE_DIR. """ cull_after = datetime.timedelta(seconds=settings.DATASTORE_CULL_TIME) now = datetime.datetime.now() with LockFile(LOCK_FILE_PATH): files = os.listdir(settings.DATASTORE_DIR) for f in files: root, ext = os.path.splitext(f) if ext == '.%s' % EXT: f_path = '%s/%s' % (settings.DATASTORE_DIR, f) atime = os.stat(f_path).st_atime if (now - datetime.datetime.fromtimestamp(atime)) > cull_after: with LockFile(backup_filename('%s.lock' % root)): # root == key os.unlink(f_path) def backup_filename(key): """Returns a filename to store a pickled object in based on key. """ return '%s/%s.%s' % (settings.DATASTORE_DIR, key, EXT) def has_backup(key): """Predicates that a pickled object file exists for this key. """ return os.path.isfile(backup_filename(key)) def get_backup(key): """Unpickles the data from the backup file. Raises IOError if the file does not exist. """ with LockFile(backup_filename('%s.lock' % key)): with open(backup_filename(key), 'rb') as f: return pickle.load(f) def set_backup(key, content): """Pickles content to file based on key. Raises IOError if the file does not exist. """ with LockFile(LOCK_FILE_PATH): with LockFile(backup_filename('%s.lock' % key)): target = backup_filename(key) temp_fd, temp_path = tempfile.mkstemp() temp_f = None try: temp_f = os.fdopen(temp_fd, 'wb') pickle.dump(content, temp_f) finally: if temp_f is not None: temp_f.close() # do file ops as atomically as possible if os.path.isfile(target): os.rename(target, '%s.old' % target) # rename old backup os.rename(temp_path, target) # move tmp to backup os.unlink('%s.old' % target) # remove old backup else: os.rename(temp_path, target) # move tmp to backup cull_counter = Counter(0) def get(key, get_new, store_time, use_cache=True): """Similar to cache.get, datastore.get attempts to load a pickled object from the Django cache using key. However, if cache.get returns None, the datastore will use get_new to create a new object, which it will then store in the cache. The big difference between datastore and cache is that a physical backup of the last stored object is kept, so that in case get_new fails, the most recent valid version of the object is returned. """ assert callable(get_new), "get_new must be callable" assert isinstance(store_time, int), "store_time must be an integer" cull_counter.inc() if use_cache: content = cache.get(key) if content is None: # content is not cached try: # attempt to get new content = get_new() cache.set(key, content, store_time) set_backup(key, content) except Exception, e: # failing that, get backup if has_backup(key): content = get_backup(key) cache.set(key, content, store_time/2) # only set backup for half time else: # if no backup, continue with exception raise e else: if has_backup(key): # Always get backup if possible. content = get_backup(key) else: # It's no use catching exceptions here; there is no alternate action since it has no backup. content = get_new() set_backup(key, content) # Only cull after any backup has been written if cull_counter.get_count() % settings.DATASTORE_CULL_AFTER == 0: cull() return content