Django, dynamic schemata, MSSQL, and database-level multi-tenancy, part 2

It's been a couple of months, but I finally remembered that I wrote part one of a multipart post without actually putting any thought into the remaining parts. In part 1 of this series, I described our data tier's layout and the problems that we have around putting a decent framework in front of it. I got the DATABASES setting to correctly handle the conversion between MSSQL connection strings and the database configuration object that Django's ORM expects, and got database access to any of our databases through Django's Model.objects.using('databasename') syntax.

The next challenge to overcome was that of dynamic schemata: models that exist in multiple databases and have fields that can be added or removed by the system's users. At any given time, there are no guarantees on which columns are present in certain tables. Therefore, we have to determine at query-time which items to retrieve and which ones to update.

This, of course, seemed like unnecessary overhead in the average case. In most cases, there is little to no need to pull back all of these dynamic columns when only the base columns are needed, so I opted for a second, optional call to retrieve the dynamic fields after the base object has been retrieved.

I first set up a base class for our models to give some across-the-board functionality.

import copy  
from django.db import models  
from sfdb.database.query import query, nonquery


class SfBaseModel(models.Model):  
    def _get_field_column_names(self):
        return [field.db_column for field in self._meta.fields]

    def _get_column_names_from_db(self):
        # Pull all column names from INFORMATION_SCHEMA for the current model's 
        # table name.
        information_schema_query = """SELECT COLUMN_NAME
                                      FROM INFORMATION_SCHEMA.COLUMNS
                                      WHERE TABLE_NAME = '%s'""" 
                                          % self._meta.db_table

        results = query(
            query_string=information_schema_query,
            using=self._state.db)

        return [r['COLUMN_NAME'] for r in results]

    @classmethod
    def _get_model_field_names(cls):
        return [f.name for f in cls._meta.fields]

    @classmethod
    def get_field_db_column_name(cls, field_name):
        for f in cls._meta.fields:
            if f.name == field_name:
                return f.db_column
        return None

    @classmethod
    def get_field_name_by_db_column(cls, db_column):
        c_name = db_column.lower()
        for f in cls._meta.fields:
            if f.db_column.lower() == c_name:
                return f.name
        return None

    class Meta:
        abstract = True

Then I was able to write an inheriting model class that implemented the Dynamic model functionality and overrode the base save functionality.

class DynamicSchemaModel(SfBaseModel):  
    _dynamic_field_columns = None
    _dynamic_field_dbstate = None
    _dynamic_fields = None

    custom_fields = property(lambda self: self._get_dynamic_fields())

    def _diff_dynamic_fields(self):
        diff = {}
        for k, v in self._dynamic_field_dbstate.iteritems():
            if self._dynamic_fields[k] != v:
                diff[k] = self._dynamic_fields[k]
        return diff

    def sync_dynamic_fields(self, force_refresh=False):
        # Cache field names on the object; it's unlikely that we'll have a 
        # change in schema between calls.
        # TODO: For performance, these should probably be cached somewhere with 
        # a wider scope.
        if force_refresh or self._dynamic_field_columns is None:
            db_table_columns = self._get_column_names_from_db()
            model_columns = self._get_field_column_names()

            self._dynamic_field_columns = list(
                set(db_table_columns).difference(model_columns))

        column_name_string = ','.join(self._dynamic_field_columns)

        # PK should always be an Auto or Integer field currently, unless this is 
        # expanded to tables other than:
        # Contacts, Accounts, Opportunities
        query_string = """SELECT %s FROM %s WHERE %s = %s"""

        dynamic_fields_dict = list(query(query_string,
                                         self._state.db,
                                         column_name_string,
                                         self._meta.db_table,
                                         self._meta.pk.db_column,
                                         str(self.pk)))[0]
        self._dynamic_field_dbstate = dynamic_fields_dict
        self._dynamic_fields = copy.copy(self._dynamic_field_dbstate)

    def _get_dynamic_fields(self):
        if self._dynamic_fields is None:
            self.sync_dynamic_fields()
        return self._dynamic_fields

    def get_field_value(self, field_name):
        value = getattr(self, field_name, None)
        if not value:
            value = self.custom_fields.get(field_name, None)
        return value

    def get_attribute_by_column(self, column_name, default=None):
        field_name = self.get_field_name_by_db_column(column_name)
        if field_name:
            val = getattr(self, field_name)
        else:
            val = self.custom_fields.get(field_name, None)
        return val

    def save(self, *args, **kwargs):
        # We need to save the dynamic fields to the database after the base
        # fields have been saved, but only if needed
        super(DynamicSchemaModel, self).save(*args, **kwargs)

        if not (self._dynamic_fields and self._dynamic_field_dbstate):
            return

        # XOR the dicts to get the modified things
        modified_fields = set(self._dynamic_field_dbstate.items()) ^ set(self._dynamic_fields.items())

        if len(modified_fields) > 0:
            # we have modified dynamic fields, so we need to save them
            update_query = """UPDATE %s SET %s WHERE %s = %s"""

            dict_diff = self._diff_dynamic_fields()
            assignments = []
            for k, v in dict_diff.iteritems():
                try:
                    float(str(v))
                    assignments.append("%s = %s" % (k, str(v)))
                except ValueError:
                    assignments.append("%s = '%s'" % (k, str(v)))

            assignments_string = ','.join(assignments)

            update_query = update_query % (
                self._meta.db_table, assignments_string, self._meta.pk.db_column, str(self.pk))

            nonquery(update_query, using=self._state.db)

            self._dynamic_field_dbstate = copy.copy(self._dynamic_fields)

    class Meta(SalesfusionModel.Meta):
        abstract = True

I know I promised to get into the custom authentication backend in part 2, but I think that the above wall of code is good enough. Next time!

Django, dynamic schemata, MSSQL, and database-level multi-tenancy, part 1

We have a rather uniquely designed database tier in my current organization, and it causes some interesting problems when designing new components.

Our data tier currently consists of n servers running Microsoft SQL Server 2008, each running m discrete databases. Each customer has their own database, with its own schema which can differ based on each customer’s configuration, as we literally ALTER TABLE based on both in-app configuration and what we observe when syncing custom fields from the customer’s CRM via its API. For example, Customer A may use our default Contacts table with no additional custom columns, and that would have, say, 50 columns total. Customer B might have synced down all of their custom fields from Salesforce, resulting in a whopping 800 columns. It’s utterly unpredictable, and would have been a perfect use case for a NoSQL solution or, my personal preference, Postgres with hstore.

We’ve been moving toward a Django-based API and processing tier for some time now, and had successfully run Django's inspectdb to reverse engineer our default schema. Just getting the Django ORM to work with MSSQL had its own caveats, however. Considering that we generally do our development on the Python side of things with Macs and deploy to Linux servers in Amazon Web Services, we needed a Django database backend for SQL Server that didn't rely on Windows-specific implementations (I'm looking at you, ADO.NET).

On a side note, when you do a Google search for 'ADO', the first result is the definition of the English word 'ado', which is, rather hilariously:

a·do (/əˈdo͞o/), Noun: trouble or difficulty.

So, without further ado, let's drill into implementation.

First things first, we need a way to distinguish at runtime which database we want to query via Django's ORM. Normally, you would have something like the following in your settings.py -- a DATABASES dict that contains configuration for all of the databases that your Django application needs to access.

DATABASES = {  
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    },
}

However, this proves problematic if you don't know about your databases until runtime. Our architecture here revolves around having a central 'controller' database that contains a table of customers and their respective MSSQL connection information.

So, rather than dropping a static DATABASES object into our configuration, we need something that can obtain a particular database's connection details at runtime. In order to accomodate this database-level multi-tenancy, we first wrote up this DatabasesShim class.

class DatabasesShim(dict):  
    def __init__(self, *arg, **kw):
        super(DatabasesShim, self).__init__(*arg, **kw)

        # These regexes extract database name, hostname, and credentials from an MSSQL
        # connection string.
        self.mssql_db_name_pattern = re.compile(r"database=(?P<dbname>[^$;\s]*)", re.I)
        self.mssql_db_host_pattern = re.compile(r"server=(?P<dbhost>[^$;\s]*)", re.I)
        self.mssql_user_pattern = re.compile(r"uid=(?P<uid>[^$;\s]*)", re.I)
        self.mssql_pass_pattern = re.compile(r"pwd=(?P<pwd>[^$;\s]*)", re.I)

    def __getitem__(self, key):
        # Overloads the default dict `__getitem__` function.
        # When a dict attribute of the DatabasesShim is accessed, we try to obtain a database connection for the key
        try:
            item = super(DatabasesShim, self).__getitem__(key)

        except KeyError:
            from sfdb.sfcontroller.models import Customer

            template = self.get('template').copy()

            try:
                cid = key

                # We can access a database's connection by using either the customer's primary key, or their domain name, which is unique to their account.
                try:
                    cid = int(key)
                    customer = Customer.objects.using("sfcontroller").get(customer_id=cid)
                except Exception, e:
                    customer = Customer.objects.using("sfcontroller").get(domainname=cid)

                connection_string = getattr(customer, 'sqlconnectstring')

                dbname_matches = self.mssql_db_name_pattern.search(connection_string)
                dbhost_matches = self.mssql_db_host_pattern.search(connection_string)
                pass_matches = self.mssql_pass_pattern.search(connection_string)
                user_matches = self.mssql_user_pattern.search(connection_string)

                template['NAME'] = dbname_matches.group('dbname')
                template['HOST'] = dbhost_matches.group('dbhost')
                template['USER'] = user_matches.group('uid')
                template['PASSWORD'] = pass_matches.group('pwd')

                return template
            except ObjectDoesNotExist:
                return None
        return item

This way, we can instead instantiate our DATABASES setting as follows:

DATABASES = DatabasesShim(  
    template={
        'ENGINE': 'sqlserver',
        'CONN_MAX_AGE': 6000,
        'AUTOCOMMIT': True,
        'TEST_MIRROR': 'default',
        'OPTIONS': {
            'host_is_server': True,
            'login_timeout': 1800,
            'use_legacy_date_fields': True,
        },
    },
    sfcontroller={
        'NAME': 'Controller',
        'ENGINE': 'sqlserver',
        'HOST': 'xxxxxxxxx',
        'USER': 'xxxxxxxxx',
        'PASSWORD': 'xxxxxxxxx',
        'TEST_MIRROR': 'default',
        'OPTIONS': {
            'host_is_server': True,
            'use_legacy_date_fields': True,
        },
    },
    default={
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    }
)

With these in place, we can easily access any database via Django's Model.objects.using(db_name) syntax.

In part 2, I'll go into our implementation of a Django authentication backend that authenticates users across databases and how we handled the tables with dynamic, unpredictable schemata.