You can do this similar to the BIDE approach. It can be done like this:
class TreeNode:
def __init__(self, element, depth, count=0, parent=None):
self.count= count
self.element= element
self.depth= depth
self.subnodes= dict()
self.parent= parent
def __repr__(self):
return f'{self.__class__.__name__}({self.element}, {self.depth}, {self.count})'
def get_subnode(self, element, create=True):
if create is True:
return self.subnodes.setdefault(element, TreeNode(
element,
self.depth+1,
count=0,
parent=self
)
)
else:
return self.subnodes.get(element)
def get_subnode_increasing_count(self, element):
subnode= self.get_subnode(element)
subnode.count+= 1
return subnode
def harvest(self, N, m, M, prefix=[], append_result_to=None):
# get all sequences which occurs more than N times
# and have a length (depth) of m < depth < M
prefix= prefix.copy()
prefix_len= len(prefix)
if append_result_to is None:
result= list()
else:
result= append_result_to
if self.element is not None:
prefix.append(self.element)
prefix_len+= 1
if N >= self.count or prefix_len >= M:
return result
if N < self.count and m < prefix_len and prefix_len < M:
# the subsequence matches the constraints
result.append(prefix)
for subnode in self.subnodes.values():
subnode.harvest(
N,
m,
M,
prefix=prefix,
append_result_to=result
)
return result
def get_prefix(self):
if self.parent is None:
prefix= []
else:
prefix= self.parent.get_prefix()
if self.element is not None:
prefix.append(self.element)
return prefix
def print_tree_counts(self, leaves_only=False, min_count=0):
if leaves_only is False or not self.subnodes:
if self.count >= min_count:
print(self.get_prefix(), self.count)
for subnode in self.subnodes.values():
subnode.print_tree_counts(leaves_only=leaves_only, min_count=min_count)
sequential approach
def find_patterns(S, N, m, M):
root= TreeNode(None, 0, 0)
active_nodes= []
for el in S:
root.count+=1
# append the root node
active_nodes.append(root)
# now replace all nodes in active nodes by the node
# that is reached one level deeper following el
active_nodes= [node.get_subnode_increasing_count(el) for node in active_nodes if node.depth < M]
# now harvest the tree after applying the restrictions
return root, root.harvest(N, m, M)
S='AAFFAABDEFEDFFAAFCCABDDCCCABDDCACBEABCFFAAEABCADEFFAABCDAEABCABDD'
root, patterns= find_patterns(S, 3, 3, 6)
patterns
The result is:
[['F', 'F', 'A', 'A']]
Your second pattern occurs exactly 3 times and so doesn't fullfill the requirement of > 3 occurances ([C,A,B,D,D]).
modifications for parallel processing
To make it processable in parallel you can do a slide modification. Just create another method in TreeNode, that allows to merge nodes. Like this:
def merge_nodes(self, other_nodes):
# merge other_nodes into this node
# including all subnodes
if len(other_nodes) > 0:
elements= set()
for other_node in other_nodes:
self.count+= other_node.count
elements.update(other_node.subnodes.keys())
# elements now contains the set of the next elements
# with which the sequence continues across the
# other nodes
for element in elements:
# get the node of the resulting tree that represents
# the sequnce continuing with element, if there is
# no such subnode, create one, since there is at least
# one other node that counted sequence seq + element
my_subnode= self.get_subnode(element, create=True)
other_subnodes= list()
for other_node in other_nodes:
# now get the subnode for each other node, that
# represents the same sequence (seq + element)
other_subnode= other_node.get_subnode(element, create=False)
if other_subnode is not None:
other_subnodes.append(other_subnode)
# merge the subnode the same way
my_subnode.merge_nodes(other_subnodes)
Now you can call find_patterns on subtrees, but you need to take into account, that you may not split the input sequence normally. You need to define some overlapping sequence part, so that patterns, that begin at the end of one sequence-fragment can be completed but that this overlapping sequences need to be counted differently (otherwise you get double-counts, which lead to wrong inputs). So you have to make sure, that only patterns, which began in the sequence fragment are continued with the overlap part, but that no new patterns are started in the overlap part, because you count them elsewere:
def find_patterns(S, N, m, M, overlap=[]):
root= TreeNode(None, 0, count=0, parent=None)
active_nodes= []
def process_element(active_nodes, element, M):
return [
node.get_subnode_increasing_count(element)
for node in active_nodes
if node.depth < M
]
for el in S:
root.count+=1
# append the root node
active_nodes.append(root)
# now replace all nodes in active nodes by the node
# that is reached one level deeper following el
active_nodes= process_element(active_nodes, el, M)
# complete the already started sequences with the
# overlapping sequence (the sequence that may be
# processed in another process)
for el in overlap:
active_nodes= process_element(active_nodes, el, M)
# now harvest the tree after applying the restrictions
return root, root.harvest(N, m, M)
Now you can do:
split_point= len(S) // 2
S1= S[split_point:]
S2= S[:split_point]
S1_overlap= S2[:10]
# in fact you could have just used :5 above,
# but that doesn't affect the result
r1, _= find_patterns(S1, 3, 3, 6, overlap=S1_overlap)
# Note: you could safely start the comand in the next
# line in parallel to the previous call of find_patterns
# and of course, you could split your sequence in as
# many fragments, as you like, as long as you maintain
# the overlap correctly and call merge_nodes afterwards
r2, _= find_patterns(S2, 3, 3, 6)
r1.merge_nodes([r2])
patterns= r1.harvest(3, 3, 6)
This yields the same result as above. To make it a bit clearer, what I mean with the overlap part:
S1 is 'CBEABCFFAAEABCADEFFAABCDAEABCABDD'
S2 is 'AAFFAABDEFEDFFAAFCCABDDCCCABDDCA'
S1_overlap is 'AAFFAABDEF'
With find_patterns(S1, 3, 3, 6)
you would only search for repeated patterns in S1, so it would not consider patterns which begin with the part BDD (starting at the end of S1) and are continued within S2. With find_patterns(S1, 3, 3, 6, overlap=S1_overlap)
I consider these patterns as well.