test-embedding.py

import numpy as np
import sklearn.decomposition
import pickle
import time

# Apply 'Algorithm 1' to the ada-002 embeddings to make them isotropic, taken from the paper:
# ALL-BUT-THE-TOP: SIMPLE AND EFFECTIVE POST-PROCESSING FOR WORD REPRESENTATIONS
# Jiaqi Mu, Pramod Viswanath
# This uses Principal Component Analysis (PCA) to 'evenly distribute' the embedding vectors (make them isotropic)
# For more information on PCA, see https://jamesmccaffrey.wordpress.com/2021/07/16/computing-pca-using-numpy-without-scikit/
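# In short, the post-processing as implemented below is (added summary, restating the code, not extra steps):
#   1. mu          = mean of all embeddings
#   2. v_tilde(w)  = v(w) - mu                      (center each embedding)
#   3. u_1 .. u_D  = top D principal components of the centered embeddings
#   4. v_prime(w)  = v_tilde(w) - sum_{i=1..D} (u_i . v(w)) * u_i, then normalized to unit length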
# get the file pointer of the pickle containing the embeddings
fp = open('/path/to/your/data/Embedding-Latest.pkl', 'rb')
# the embedding data here is a dict consisting of key / value pairs
# the key is the hash of the message (SHA3-256), the value is the embedding from ada-002 (array of dimension 1536)
# the hash can be used to look up the original text in a database
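# illustrative structure of the dict (hash and values below are made up, not real data):
#   E = { '<sha3-256 hex of message text>': [0.0123, -0.0045, ...],  # 1536 floats from ada-002
#         ... }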
E = pickle.load(fp) # load the data into memory

# separate the keys (hashes) and values (embeddings) into their own containers
K = list(E.keys()) # list of all the hash keys
X = np.array(list(E.values())) # array of all the embeddings, converted to numpy

# print the total number of embeddings
# this set can be truncated if there are too many embeddings to do PCA on
print(f"Total number of embeddings: {len(X)}")
# get the dimension of the embeddings, used later
Dim = len(X[0])

# print out the first few embeddings
print("First two embeddings are: ")
print(X[0])
print(f"First embedding length: {len(X[0])}")
print(X[1])
print(f"Second embedding length: {len(X[1])}")

# compute the mean of all the embeddings, and print the result
mu = np.mean(X, axis=0) # same as mu in the paper
print(f"Mean embedding vector: {mu}")
print(f"Mean embedding vector length: {len(mu)}")

# subtract the mean vector from each embedding vector ... vectorized in numpy
X_tilde = X - mu # same as v_tilde(w) in the paper

# do the heavy lifting of extracting the principal components
# note that this is a function of the embeddings you currently have here, and this set may grow over time
# therefore the PCA basis vectors may change over time, and your final isotropic embeddings may drift over time
# but the drift should stabilize after you have extracted enough embedding data to characterize the nature of the embedding engine
print("Performing PCA on the centered embeddings ...")
pca = sklearn.decomposition.PCA() # new PCA object
TICK = time.time() # start timer
pca.fit(X_tilde) # do the heavy lifting!
TOCK = time.time() # end timer
DELTA = TOCK - TICK
print(f"PCA finished in {DELTA} seconds ...")

# dimensional reduction stage (the only hyperparameter)
# pick how many of the top PCA components to remove (project out) from the embeddings
# in general this is some integer less than or equal to the dimension of your embeddings
# it could also be derived from pca.explained_variance_ratio_, e.g. enough components to cover 95% of the variance (see the sketch just below)
# but it is just hardcoded as a constant here
D = 15 # hyperparameter on dimension (out of 1536 for ada-002), paper recommends D = Dim/100
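# Informational alternative (an added sketch, not part of the original recipe): report how many
# components it would take to cover 95% of the variance, per the note above; D itself is unchanged.
D95 = int(np.searchsorted(np.cumsum(pca.explained_variance_ratio_), 0.95)) + 1
print(f"Components needed to cover 95% of the variance: {D95} (D actually used: {D})")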
# form the set of v_prime(w), which is the final embedding
# this could be vectorized in numpy to speed it up (a vectorized sketch follows the loop),
# but it is coded directly here as a double for-loop to avoid errors and to be transparent
E_prime = dict() # output dict of the new embeddings
N = len(X_tilde)
N10 = max(1, round(N/10)) # progress-print interval (at least 1, so the modulo below never divides by zero)
U = pca.components_ # set of PCA basis vectors, sorted from most significant to least significant
print(f"Shape of full set of PCA components {U.shape}")
U = U[0:D,:] # take the top D components (or take them all if D is the size of the embedding vector)
print(f"Shape of downselected PCA components {U.shape}")
for ii in range(N):
    v_tilde = X_tilde[ii]
    v = X[ii]
    v_projection = np.zeros(Dim) # start to build the projection
    # project the original embedding onto the PCA basis vectors, use only the first D components
    for jj in range(D):
        u_jj = U[jj,:] # vector
        v_jj = np.dot(u_jj,v) # scalar
        v_projection += v_jj*u_jj # vector
    v_prime = v_tilde - v_projection # final embedding vector
    v_prime = v_prime/np.linalg.norm(v_prime) # normalize to a unit vector
    E_prime[K[ii]] = v_prime
    if (ii%N10 == 0) or (ii == N-1):
        print(f"Finished with {ii+1} embeddings out of {N} ({round(100*(ii+1)/N)}% done)")
# save as new pickle
print("Saving new pickle ...")
embeddingName = '/path/to/your/data/Embedding-Latest-Isotropic.pkl'
with open(embeddingName, 'wb') as f:
    pickle.dump([E_prime,mu,U], f)
print(embeddingName)
print("Done!")
# When working with live data and a new embedding from ada-002, be sure to transform it first
# with this function before comparing it against the isotropic embeddings saved above
def projectEmbedding(v,mu,U):
    v = np.array(v)
    v_tilde = v - mu
    v_projection = np.zeros(len(v)) # start to build the projection
    # project the original embedding onto the PCA basis vectors, use only the first D components
    for u in U:
        v_jj = np.dot(u,v) # scalar
        v_projection += v_jj*u # vector
    v_prime = v_tilde - v_projection # final embedding vector
    v_prime = v_prime/np.linalg.norm(v_prime) # normalize to a unit vector
    return v_prime
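
# Example usage (added for illustration, not part of the original script): reload the pickle that
# was just written and project one of the raw embeddings; the result should match the stored one.
with open(embeddingName, 'rb') as f:
    E_prime_loaded, mu_loaded, U_loaded = pickle.load(f)
v_new = projectEmbedding(X[0], mu_loaded, U_loaded) # in live use, X[0] would be a fresh ada-002 embedding
print(f"projectEmbedding() matches stored isotropic embedding: {np.allclose(v_new, E_prime_loaded[K[0]])}")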