tsunami

log in
history

Haskell DB dependency stuff

Luke Breuer
2010-04-23 11:16 UTC

import List

data RelName = RN { r_name :: String }
  deriving Eq

data Attr = A { a_name :: String }
  deriving (Eq, Ord)
  
type Attrs = [Attr]

data Dep = D { determines, implied :: Attrs }
  deriving (Eq, Ord)

data Rel = R { r :: RelName, attrs :: Attrs, ckeys :: [Attrs], deps :: [Dep]  }

instance Show Dep where  
  show (D d i) = (show_attrs d) ++ " -> " ++ (show_attrs i)
    where
      single_chars a = (length a) == (sum $ map (length . a_name) a)
      show_attrs a | single_chars a = foldl (\x (A a) -> x ++ a) "" a 
                   | otherwise      = if (length a) == 1 then show $ head a else show a
        
instance Show Attr where
  show (A n) = n
  
instance Show RelName where
  show (RN r) = show r
  
instance Show Rel where
  show (R r a k d) = "{r = "  ++ show r ++ 
                     ", a = " ++ show a ++ 
                     ", k = " ++ show k ++
                     ", d = " ++ show d ++
                     "}"

print_rels :: [Rel] -> IO ()
print_rels rels = mapM_ putStrLn $ map (("  " ++) . show) rels

print_deps :: [Dep] -> IO ()
print_deps deps = mapM_ putStrLn $ map (("  " ++) . show) deps

print_depss :: [[Dep]] -> IO ()
print_depss = sequence_ . (foldl folder [])
  where
    folder [] ys = [print_deps ys]
    folder xs ys = xs ++ ((putStrLn "--------------") : print_deps ys : [])

m_D :: Attrs -> Attrs -> Dep
m_D d i = D (sort d) (sort i)
                     
m_attrs :: [String] -> [Attr]
m_attrs = map A
  
m_dep :: [String] -> [String] -> Dep
m_dep determines implied = D (m_attrs determines) (m_attrs implied)

m_rel :: String -> [String] -> [[String]] -> [([String], [String])] -> Rel
m_rel name attrs ckeys deps = R (RN name) (m_attrs attrs) ckeys_ deps_
  where
    ckeys_ = map m_attrs         ckeys
    deps_  = map (uncurry m_dep) deps

subset :: Eq a => [a] -> [a] -> Bool
subset a b = all (\x -> elem x b) a

properSubset :: Eq a => [a] -> [a] -> Bool
properSubset a b = subset a b && (length a) < (length b)

unions :: (Eq a) => [[a]] -> [a]
unions = fold union

subsets :: [a] -> [[a]]
subsets []     = [[]]
subsets (x:xs) = subsets xs ++ map (x :) (subsets xs)

-- Returns all the subsets which contain one fewer element.
-- If passed
--   1 element : []
--   0 elements: error
subsets_1_ex :: [a] -> [([a], a)]
subsets_1_ex [] = error "expected at least one element"
subsets_1_ex (x:xs) = takeWhile not_end $ map lister $ iterate next ([], x, xs)
  where
    next (xs, y, z:zs) = (y:xs, z, zs)
    next (_,  y, []  ) = ([], y, [])
    lister (xs, y, zs) = (xs ++ zs, y)
    not_end ([], _)    = False
    not_end (_,  _)    = True
    []     ++ ys       = ys
    (x:xs) ++ ys       = xs ++ (x : ys) -- redefine to reverse while appending (less efficient)
    
subsets_1 :: [a] -> [[a]]
subsets_1 xs = map fst $ subsets_1_ex xs

set_eq :: Ord a => [a] -> [a] -> Bool
set_eq xs ys = (sort xs) == (sort ys)

nonempty_subsets :: [a] -> [[a]]
nonempty_subsets = filter ((> 0) . length) . subsets

cartesian :: [a] -> [a] -> [(a, a)]
cartesian a b = [(x, y) | x <- a, y <- b]    

is_trivial :: Dep -> Bool
is_trivial (D d i) = subset i d

find_bad_bcnf_deps :: Rel -> [Dep]
find_bad_bcnf_deps (R _ a _ deps) = 
    filter (\d -> not (is_trivial d || is_superkey d)) deps
  where
    ckeys = candidate_keys a deps
    is_superkey (D d _) = not $ null $ filter (\x -> subset x d) ckeys

is_bcnf :: Rel -> Bool
is_bcnf = null . find_bad_bcnf_deps

bcnf_decompose :: ([Dep] -> Dep) -> Rel -> [Rel]
bcnf_decompose picker r@(R n a _ deps) = if is_bcnf r then [r] else decompose
  where
    (D d i) = picker $ find_bad_bcnf_deps r
    rename suffix = RN ((r_name n) ++ suffix)
    attrs_1 = union d i
    attrs_2 = a \ (i \ d)
    deps_ a = filter ((D d i) -> subset (union d i) a) deps
    ckeys a = candidate_keys a (deps_ a)
    m_r suffix attrs = bcnf_decompose picker $ R (rename suffix) attrs (ckeys attrs) (deps_ attrs)
    decompose = m_r "_1" attrs_1 ++ m_r "_2" attrs_2

synthesize_3NF :: [Dep] -> [Rel]
synthesize_3NF deps = if any has_candidate folded 
    then folded 
    else R (RN "R") ckey [ckey] (relevant_deps ckey) : folded
  where
    cc                            = canonical_cover deps
    attrs                         = unions $ map ((D d i) -> union d i) deps
    ckey                          = head $ candidate_keys attrs cc
    has_attrs (D d i) (R _ a _ _) = subset (union d i) a
    is_relevant   a (D d i)       = (subset d a) && not (null $ intersect i a)
    make_dep      a (D d i)       = m_D d (intersect i a)
    relevant_deps a               = nub $ map (make_dep a) $ filter (is_relevant a) cc
    make_rel        (D d i)       = make_rel_2 (union d i) $ relevant_deps (union d i)
    make_rel_2    a d             = R (RN "R") a (candidate_keys a d) d
    add rels d                    = if not $ any (has_attrs d) rels then make_rel d : rels else rels
    folded                        = foldl add [] cc
    has_candidate (R _ a _ _)     = subset ckey a

-- Note that this is perhaps not what one might consider a full attribute 
-- cover; no transitivity rules are applied here.  Instead, the passed
-- dependencies are reduced to covers, which contributes to a more efficient
-- dependency inference system.
attribute_cover :: [Dep] -> [Dep]
attribute_cover deps = map agg $ groupBy same_d $ sort deps
  where
    same_d  (D d1 _) (D d2 _) = d1 == d2
    combine i (D _ i1)        = union i i1
    agg     a@((D d _):_)     = m_D d $ foldl combine [] a
    agg     []                = error "groups should always be non-empty"

-- assumes any arrays are sorted
iterate_equal :: Eq a => (a -> a) -> a -> a
iterate_equal f a = until (\x -> x == f x) f a

-- Adds attributes to each dependency until it infers all attributes.
cover :: [Dep] -> [Dep]
cover deps = union deps $ map aug deps
  where
    attrs = unions $ map ((D d i) -> union d i) deps
    aug (D d i) = m_D (d union needed) (unions [d, i, needed])
      where
        needed = attrs \ (i union d)

-- The below uses an efficient system for determining whether to apply transitivity:
--   given:   S -> T,
--            Q -> R
--   if:      T U X = Q, and
--            X U S - R is nonempty,
--   where:   X = Q - T, then our new, nontrivial dependency is:
--            S U X -> (R U X) - S
-- In addition to applying transitivity, we reduce our results to covers (see the
-- note for attribute_cover), and also augment the end result so that we have all
-- of the "interesting" dependencies that infer all _other_ attributes.  (That is,
-- we also return the dependencies that determine candidate keys.)
infer_cover :: [Dep] -> [Dep]
infer_cover = cover . iterate_equal all
  where
    is_trans ((D d1 i1), (D d2 i2)) = not $ i2 subset ((d2 \ i1) union d1)
    m_trans  ((D d1 i1), (D d2 i2)) = m_D (d1 union (d2 \ i1)) ((i1 union i2) \ d1)
    trans    d                      = map m_trans $ filter is_trans $ cartesian d d
    all      deps                   = attribute_cover $ deps union trans deps

candidate_keys_R :: Rel -> [[Attr]]
candidate_keys_R (R _ a _ d) = candidate_keys a d
    
candidate_keys :: Attrs -> [Dep] -> [[Attr]]
candidate_keys a [] = [a]
candidate_keys a d = map augment $ filter no_prop_subs all_inferred
  where
    a_all                 = sort $ intersect a $ unions $ map ((D d i) -> union d i) d
    inferred              = infer_cover d
    all_inferred          = filter ((D _ i) -> i == a_all) inferred
    no_prop_subs (D d1 _) = not $ any ((D d2 _) -> properSubset d2 d1) all_inferred
    absent                = a \ a_all
    augment (D d _)       = union d absent

-- adds trivial dependencies
explode :: Attrs -> [Dep] -> [Dep]
explode a d = foldl folder d [aug_d, aug_di, sub_i]
  where
    aug_d  (D d i) = map (\a -> m_D (union d a) i           ) $ subsets a
    aug_di (D d i) = map (\a -> m_D (union d a) (union i a) ) $ subsets a
    sub_i  (D d i) = map (m_D d)                              $ nonempty_subsets i
    folder agg f   = union agg $ unions $ map f agg

-- infers everything, including trivial dependencies
infer_full :: Attrs -> [Dep] -> [Dep]
infer_full a d = nub $ explode a (infer_cover d)
--               ^^^ why this is required is a bit of a mystery

canonical_cover :: [Dep] -> [Dep]
canonical_cover deps = if null same then cover else canonical_cover $ head same
  where
    cover         = attribute_cover deps
    sub_i (D d i) = map (\s -> D d s) $ subsets_1 i -- Dep -> [Dep]
    sub_d (D d i) = map (\s -> D s i) $ subsets_1 d -- Dep -> [Dep]
    alter :: (a -> [a]) -> ([a], a) -> [[a]]
    alter f (xs, y) = map (: xs) $ f y
    subbed f        = concat $ map (alter f) $ subsets_1_ex cover    
    all             = concat $ map subbed [sub_i, sub_d]
    infer_same a    = (infer_cover a) set_eq (infer_cover cover)
    same            = filter infer_same all

canonical_cover_splained :: [Dep] -> IO ()
canonical_cover_splained d = 
  let
    ss = cc d
  in
  do
    print_deps $ nub d
    if null ss 
      then do
        putStrLn "done!"
        return ()
      else do
        putStrLn "---"    
        canonical_cover_splained $ head ss
  where
    cc :: [Dep] -> [[Dep]]
    cc deps = same
      where
        cover         = attribute_cover deps
        sub_i (D d i) = map (\s -> D d s) $ subsets_1 i -- Dep -> [Dep]
        sub_d (D d i) = map (\s -> D s i) $ subsets_1 d -- Dep -> [Dep]
        alter :: (a -> [a]) -> ([a], a) -> [[a]]
        alter f (xs, y) = map (: xs) $ f y
        subbed f        = concat $ map (alter f) $ subsets_1_ex cover    
        all             = concat $ map subbed [sub_i, sub_d]
        infer_same a    = (infer_cover a) set_eq (infer_cover cover)
        same            = filter infer_same all
      
   
p3 :: Rel
p3 = m_rel
  "R"
  ["a", "b", "c", "d", "e"]
  [[]]
  [(["a"], ["b", "c"]),
   (["c", "d"], ["e"]),
   (["b"], ["d"]),
   (["e"], ["a"])]
   
p5 :: Rel
p5 = m_rel
  "R"
  ["a", "b", "c", "d", "e", "g"]
  [[]]
  [(["a"], ["e"]),
   (["b", "c"], ["d"]),
   (["c"], ["a"]),
   (["a", "b"], ["d"]),
   (["d"], ["g"]),
   (["b", "c"], ["e"]),
   (["d"], ["e"]),
   (["b", "c"], ["a"])]

p6 :: Rel
p6 = m_rel
  "R"
  ["course_id", "section_id", "dept", "units", "course_level", "instructor_id", 
   "term", "year", "meet_time", "room", "num_students"]
  [[]]
  [(["course_id"], ["dept", "units", "course_level"]),
   (["course_id", "section_id", "term", "year"], ["meet_time", "room", "num_students", "instructor_id"]),
   (["room", "meet_time", "term", "year"], ["instructor_id", "course_id", "section_id"])] 
        
--main :: IO ()
--main = print_deps $ canonical_cover $ deps p6