123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
import pickle
import time

import numpy as np
import sklearn.decomposition
# Post-process a pickled {key: embedding} mapping: center the embeddings,
# fit PCA on the centered data, subtract each vector's component along the
# first D principal directions, re-normalize, and save the result together
# with the mean vector and the retained directions.

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
with open('/path/to/your/data/Embedding-Latest.pkl', 'rb') as fp:
    E = pickle.load(fp)

K = list(E.keys())
X = np.array(list(E.values()))  # presumably equal-length numeric vectors - TODO confirm
if len(X) == 0:
    # Everything below indexes X[0]; fail early with a clear message instead.
    raise ValueError("The embedding pickle contains no embeddings.")

print(f"Total number of embeddings: {len(X)}")
Dim = len(X[0])
print("First two embeddings are: ")
print(X[0])
print(f"First embedding length: {len(X[0])}")
if len(X) > 1:
    # Only show a second sample when there is one.
    print(X[1])
    print(f"Second embedding length: {len(X[1])}")

# Center the data on the mean embedding.
mu = np.mean(X, axis=0)
print(f"Mean embedding vector: {mu}")
print(f"Mean embedding vector length: {len(mu)}")
X_tilde = X - mu

print("Performing PCA on the normalized embeddings ...")
pca = sklearn.decomposition.PCA()
TICK = time.time()
pca.fit(X_tilde)
TOCK = time.time()
DELTA = TOCK - TICK
print(f"PCA finished in {DELTA} seconds ...")

D = 15  # number of leading principal directions to remove
E_prime = dict()
N = len(X_tilde)
# Progress is reported roughly every 10%; clamp to 1 so that small inputs
# (N <= 5 rounds to 0) do not cause a modulo-by-zero below.
N10 = max(1, round(N / 10))
U = pca.components_
print(f"Shape of full set of PCA componenents {U.shape}")
U = U[0:D, :]
print(f"Shape of downselected PCA componenents {U.shape}")

# Float64 copy used only for the arithmetic below, so the stored vectors are
# float64 regardless of the dtype PCA returned; U itself is saved unchanged.
U_f64 = U.astype(np.float64)

for ii, (key, v, v_tilde) in enumerate(zip(K, X, X_tilde)):
    # (U @ v) holds one coefficient per retained direction; multiplying by U
    # again sums coefficient * direction over all of them.
    # NOTE(review): the coefficients are taken against the raw vector v, not
    # the centered v_tilde. Kept exactly as in the original - confirm intent.
    v_projection = (U_f64 @ v) @ U_f64
    v_prime = v_tilde - v_projection
    norm = np.linalg.norm(v_prime)
    if norm > 0:
        # A zero residual is stored as-is rather than turned into NaNs.
        v_prime = v_prime / norm
    E_prime[key] = v_prime
    if (ii % N10 == 0) or (ii == N - 1):
        print(f"Finished with {ii+1} embeddings out of {N} ({round(100*ii/N)}% done)")

print("Saving new pickle ...")
embeddingName = '/path/to/your/data/Embedding-Latest-Isotropic.pkl'
with open(embeddingName, 'wb') as f:
    # Saved as a list: [processed embeddings, mean vector, retained directions].
    pickle.dump([E_prime, mu, U], f)
print(embeddingName)
print("Done!")
def projectEmbedding(v, mu, U):
    """Center *v*, remove its component along the rows of *U*, and normalize.

    Args:
        v: A single embedding (any 1-D sequence of numbers).
        mu: Vector subtracted from *v* before the projection is removed.
        U: Iterable of direction vectors, one per row, each the same length
            as *v*. Presumably orthonormal PCA components - TODO confirm
            against the caller.

    Returns:
        A float64 NumPy array of the same length as *v*, scaled to unit
        Euclidean norm. If the residual is exactly zero it is returned
        unscaled (all zeros) instead of dividing by zero.
    """
    v = np.asarray(v, dtype=float)
    v_tilde = v - mu

    components = np.asarray(U, dtype=float)
    if components.size:
        # (components @ v) holds one coefficient per direction; multiplying
        # by components again sums coefficient * direction over all rows.
        # NOTE(review): the coefficients are taken against the raw v, not the
        # centered v_tilde. Kept exactly as in the original - confirm intent.
        v_projection = (components @ v) @ components
    else:
        # No directions supplied: nothing to remove.
        v_projection = np.zeros_like(v)

    v_prime = v_tilde - v_projection
    norm = np.linalg.norm(v_prime)
    if norm > 0:
        v_prime = v_prime / norm
    return v_prime
|