Metric Spaces and Embeddings¶
Metric Spaces¶
A metric space $(X, d)$ is a set $X$ with a distance function $d: X \times X \to \mathbb{R}$. The distance function satisfies the following properties:
- $d(x, y) \geq 0$ for all $x, y \in X$.
- $d(x, y) = 0$ if and only if $x = y$.
- $d(x, y) = d(y, x)$ for all $x, y \in X$.
- $d(x, y) \leq d(x, z) + d(z, y)$ for all $x, y, z \in X$.
Some examples of metric spaces (the first two are sketched in code below):
- A connected weighted graph $G = (V, E)$ with positive edge weights is a metric space, with $X = V$ and $d(x, y) =$ the length of the shortest path between $x$ and $y$.
- The DNA space with $X = \{A, C, G, T\}^n$ and $d(x, y) =$ the number of positions where $x$ and $y$ differ (the Hamming distance).
- The Euclidean space $\mathbb{R}^n$ with $X = \mathbb{R}^n$ and $d(x, y) = \|x - y\|_2$.
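To make these examples concrete, here is a minimal standalone sketch of the first two metrics: the shortest-path metric on a small weighted graph (computed with Floyd–Warshall) and the Hamming distance on DNA strings. The graph weights and the strings are made-up illustrative values.
import itertools
import numpy as np

def hamming(x, y):
    # Number of positions where strings x and y differ
    return sum(a != b for a, b in zip(x, y))

# Shortest-path metric on a 4-vertex weighted graph via Floyd-Warshall;
# INF marks a missing edge, and the edge weights are arbitrary examples.
INF = float('inf')
w = np.array([[0, 2, INF, 7],
              [2, 0, 3, INF],
              [INF, 3, 0, 1],
              [7, INF, 1, 0]], dtype=float)
d = w.copy()
for k, i, j in itertools.product(range(4), repeat=3):  # k varies slowest
    d[i, j] = min(d[i, j], d[i, k] + d[k, j])

print(hamming("ACGT", "AGGA"))  # 2
print(d[0, 3])                  # 6.0, via the path 0 -> 1 -> 2 -> 3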
Maps¶
A map $f: X \to Y$ between two metric spaces $(X, d_X)$ and $(Y, d_Y)$ is called an embedding of $X$ into $Y$. The embedding is said to be distance-preserving (isometric) if $d_Y(f(x), f(y)) = d_X(x, y)$ for all $x, y \in X$. However, distance-preserving embeddings rarely exist: for instance, three points at pairwise distance $1$ cannot be placed on the real line, since on a line one pairwise distance must equal the sum of the other two. Instead, we often consider embeddings that are "almost" distance-preserving.
Embeddings and Distortions¶
An embedding with distortion $\alpha$ of a metric space $(X, d_X)$ into another metric space $(Y, d_Y)$ is a map $f: X \to Y$ such that there exists a constant $r > 0$ for which $r \cdot d_X(x, y) \leq d_Y(f(x), f(y)) \leq \alpha r \cdot d_X(x, y)$ for all $x, y \in X$. The distortion of $f$ is the smallest $\alpha$ for which such an $r$ exists.
We can equivalently define the distortion in terms of the expansion and contraction. Given a map $f: X \to Y$, let:
Expansion($f$) $= \max_{x \neq y \in X} \frac{d_Y(f(x), f(y))}{d_X(x, y)}$
Contraction($f$) $= \max_{x \neq y \in X} \frac{d_X(x, y)}{d_Y(f(x), f(y))}$.
Define the distortion of $f$ as $\alpha = \text{Expansion}(f)\cdot\text{Contraction}(f)$.
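As a sanity check on these definitions, here is a small sketch that measures the expansion, contraction, and distortion of a map between two finite metric spaces given as pairwise distance matrices. The three-point example and the map onto the real line are made up for illustration.
import numpy as np

def map_distortion(d_X, d_Y):
    # d_X[i, j] = d_X(x_i, x_j); d_Y[i, j] = d_Y(f(x_i), f(x_j))
    n = len(d_X)
    pairs = [(i, j) for i in range(n) for j in range(n) if i != j]
    expansion = max(d_Y[i, j] / d_X[i, j] for i, j in pairs)
    contraction = max(d_X[i, j] / d_Y[i, j] for i, j in pairs)
    return expansion, contraction, expansion * contraction

# Uniform metric on three points, mapped to 0, 1, 2 on the real line
d_X = np.ones((3, 3)) - np.eye(3)
pts = np.array([0.0, 1.0, 2.0])
d_Y = np.abs(pts[:, None] - pts[None, :])
print(map_distortion(d_X, d_Y))  # (2.0, 1.0, 2.0)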
Bourgain Embedding¶
Oftentimes we will have a finite metric space $(X, d)$ where the notion of distance is abstract. An example of such a metric space is a categorical dataset where each point is a category, the distance between two different categories is $1$, and the distance between identical categories is $0$. Another example is the Hamming cube, where each point is a binary string of length $n$ and the distance between two points is the number of positions where the strings differ.
When working with such metric spaces, many machine learning techniques fail because the distance function is not Euclidean. One way to address this issue is to embed the metric space into $\mathbb{R}^k$ for some $k$ such that the distance function is approximately preserved. The Bourgain embedding is a technique that does just that.
Given an arbitrary finite metric space $(X, d)$, a Bourgain embedding is a way to embed the metric space into the Euclidean space $\mathbb{R}^k$ with a distortion of at most $O(\log n)$, where $n$ is the number of points in the metric space and $k \in O(\log^2 n)$. The proof of this theorem is beyond the scope of this notebook, but we will see how to implement Bourgain embeddings in practice. This is a randomized algorithm, with the distortion bound holding with high probability.
The Bourgain embedding algorithm is as follows:
Let $c$ be a sufficiently large constant. For each point $x \in X$, define its embedding vector $f(x)$ in the following steps:
- For each $i \in \{1, 2, \dots, \lceil \log_2(n) \rceil \}$ and each $j \in \{1, 2, \dots, c \cdot \lceil \log_2(n) \rceil \}$:
    - Choose a random subset $S_{ij} \subseteq X$, where each $y \in X$ is included in $S_{ij}$ independently with probability $2^{-i}$. The same subsets $S_{ij}$ are shared by all points $x$.
    - Compute $d(x, S_{ij}) = \min_{y \in S_{ij}} d(x, y)$, the minimum distance from $x$ to any point in $S_{ij}$.
- Construct the embedding vector $f(x) = \langle d(x, S_{1,1}), d(x, S_{1,2}), \dots, d(x, S_{\lceil \log_2(n) \rceil,\, c \lceil \log_2(n) \rceil}) \rangle$, which has $\lceil \log_2(n) \rceil \cdot c \lceil \log_2(n) \rceil = O(\log^2 n)$ coordinates.
import numpy as np
import math
import random
class BourgainEmbedding:
    def __init__(self, X, d):
        """
        Initializes the Bourgain Embedding object using the Bourgain
        Embedding algorithm
        Args:
            X (sequence): The n points of the metric space. Only len(X)
                is used here; all geometry comes from the distance matrix.
            d (np.ndarray): Distance matrix of size (n, n).
        """
        self.X = X
        self.d = d
        self.n = len(X)
        self.log_n = int(math.ceil(math.log2(self.n)))
        self.m = 576 * self.log_n  # Chosen c = 576 as per the paper
        # Sample the random subsets S_ij once, shared by every point;
        # resampling them per point would break the embedding guarantee.
        self.S = []
        for i in range(self.log_n):
            for _ in range(self.m):
                # Code index i corresponds to i+1 in the description,
                # so each point is included with probability 2^{-(i+1)}
                mask = np.random.rand(self.n) < 2.0 ** -(i + 1)
                S_ij = np.flatnonzero(mask)
                if len(S_ij) == 0:
                    # Ensure that S_ij is non-empty
                    S_ij = np.array([np.random.randint(self.n)])
                self.S.append(S_ij)
        self.f_X = np.zeros((self.n, self.log_n * self.m))
        self.d_prime = self.__embed()
    def __map(self, x_idx):
        """
        Given an index x_idx, returns the value of f(x) for the point X[x_idx]
        Args:
            x_idx (int): Index of the point in X
        """
        f_x = np.zeros(self.log_n * self.m)
        for t, S_ij in enumerate(self.S):
            # Coordinate t is d(x, S_ij), the minimum distance from x to S_ij
            f_x[t] = self.d[x_idx, S_ij].min()
        return f_x
    # Embeds the points in X into a Euclidean space of dimension log_n * m
    def __embed(self):
        # Compute the value of f(x) for each x in X
        f_X = np.zeros((self.n, self.log_n * self.m))
        for i in range(self.n):
            f_X[i] = self.__map(i)
        self.f_X = f_X
        # Compute the pairwise distances between the points in
        # the embedded space using the L2 (Euclidean) norm
        d_prime = np.zeros((self.n, self.n))
        for i in range(self.n):
            for j in range(self.n):
                d_prime[i][j] = np.linalg.norm(f_X[i] - f_X[j])
        return d_prime
def distance(self, i, j):
"""
Returns the pairwise distance between the points X[i] and X[j]
in the embedded space
Args:
i (int): Index of the first point in X
j (int): Index of the second point in X
"""
return self.d_prime[i][j]
def get_distance_matrix(self):
"""
Returns the pairwise distance matrix of the embedded points
"""
return self.d_prime
def get_embedding(self):
"""
Returns the embedded points
"""
return self.f_X
Testing the Bourgain embedding¶
Example: DNA space with the Hamming metric¶
We will test the Bourgain embedding algorithm on the DNA space with the Hamming metric: the set of all DNA sequences of length $n$, where the distance between two sequences is the number of positions in which they differ. We will embed a random sample of distinct sequences into $\mathbb{R}^k$ using the Bourgain embedding algorithm and compute the distortion of the embedding.
def random_dna(length):
return ''.join(random.choice('ACGT') for _ in range(length))
def ham_dist(seq1, seq2):
return sum(1 for a, b in zip(seq1, seq2) if a != b)
def gen_seqs(num_sequences, sequence_length):
sequences = [random_dna(sequence_length)]
for _ in range(num_sequences - 1):
new_sequence = random_dna(sequence_length)
while any(ham_dist(new_sequence, seq) == 0 for seq in sequences):
new_sequence = random_dna(sequence_length)
sequences.append(new_sequence)
return sequences
num_sequences = 100
sequence_length = 200
dnaX = gen_seqs(num_sequences, sequence_length)
d = np.zeros((num_sequences, num_sequences))
for i in range(num_sequences):
for j in range(num_sequences):
d[i][j] = ham_dist(dnaX[i], dnaX[j])
# Perform the embedding
embedding = BourgainEmbedding(dnaX, d)
# Print statistics (expansion/contraction and distortion of embedding)
expansions = []
contractions = []
for i in range(num_sequences):
for j in range(num_sequences):
if i != j:
expansions.append(embedding.distance(i, j) / d[i, j])
contractions.append(d[i, j] / embedding.distance(i, j))
expansion = np.max(expansions)
contraction = np.max(contractions)
print("Embedding Statistics:")
print("=====================")
print(f"Old Dimension: {sequence_length}, New Dimension: {embedding.m}")
print(f"Expansion: {expansion}, Contraction: {contraction}")
print("Distortion: ", expansion*contraction)
Embedding Statistics:
=====================
Old Dimension: 200, New Dimension: 7000
Expansion: 121.65237319510048, Contraction: 0.011089042951659453
Distortion:  1.3490083915317743
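Although the expansion and contraction are individually far from $1$ (the embedding rescales all distances by a large factor), their product, the distortion, is only about $1.35$, well below the $O(\log n)$ guarantee for $n = 100$ points.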
# Perform multidimensional scaling and plot the old and new distances, side by side
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
dpr = embedding.get_distance_matrix()
mdsEmbed = MDS(n_components=2, dissimilarity='precomputed')
mdsOld = mdsEmbed.fit_transform(d)
mdsNew = mdsEmbed.fit_transform(dpr)
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
axs[0].scatter(mdsOld[:, 0], mdsOld[:, 1])
axs[0].set_title("MDS of Hamming Distances")
axs[1].scatter(mdsNew[:, 0], mdsNew[:, 1])
axs[1].set_title("MDS of Embedded Distances (L2 norm)")
plt.show()
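If the embedding preserves the metric well, the two scatter plots should show roughly the same configuration of points up to rotation and reflection, since MDS recovers positions only up to such transformations.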