annoy学习总结

2023-09-25 13 0

测试使用annoy案例1

# Annoy example 1: build an index of random vectors, save it to disk,
# load it back and query the neighbours of item 0.
from annoy import AnnoyIndex
import random

f = 40  # Length of item vector that will be indexed
t = AnnoyIndex(f, 'angular')
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_item(i, v)

t.build(10)  # 10 trees
t.save('test.ann')

# ...

u = AnnoyIndex(f, 'angular')
u.load('test.ann')  # super fast, will just mmap the file
print(u.get_nns_by_item(0, 10))  # will find the 10 nearest neighbors

结果如下
在这里插入图片描述

测试使用annoy案例2

# Annoy example 2: index the same five 3-D points under two metrics and
# compare the neighbour ordering each metric produces for item 0.
a = AnnoyIndex(3, 'euclidean')
b = AnnoyIndex(3, 'angular')

points = [
    [1, 0, 0],
    [0, 1, 0],
    [2, 0, 0],
    [2.5, 0, 0],
    [1, 0, 0.5],
]
for i, point in enumerate(points):
    a.add_item(i, point)
    b.add_item(i, point)

a.build(1)
b.build(1)

print(a.get_nns_by_item(0, 4))
print(b.get_nns_by_item(0, 4))

在这里插入图片描述

普通实现的MNN测试(欧式距离)

#主要的判断方式如下
import numpy as np
from sklearn.neighbors import NearestNeighbors
#import pyreadr
import numpy as np
# from sklearn.neighbors import NearestNeighbors
# x=pyreadr.read_r("x.RData")
# y=pyreadr.read_r("y.RData")
# x=x["x"].values
# y=y["y"].values
np.random.seed(1)# fix the random seed so the demo data is reproducible
x=np.random.randn(10,2)# x: 10 samples with 2 features each
y=np.random.randn(10,2)# y: 10 samples with 2 features each
# findMNN below takes x and y as ndarrays
# and returns an ndarray; Euclidean distance is the default for now.
def findMNN(x, y, k=10):
    """Find the mutual nearest neighbour (MNN) pairs between ``x`` and ``y``.

    A pair ``(x[i], y[j])`` is reported when ``y[j]`` is among the ``k``
    nearest neighbours of ``x[i]`` in ``y`` and ``x[i]`` is among the ``k``
    nearest neighbours of ``y[j]`` in ``x``. Distances use the
    ``NearestNeighbors`` default metric (Euclidean).

    Args:
        x: 2-D ndarray, one sample per row.
        y: 2-D ndarray, one sample per row.
        k: number of neighbours searched in each direction.

    Returns:
        ndarray with one row per MNN pair: the ``x`` sample followed by the
        matching ``y`` sample (so two 5-dimensional inputs give
        10-dimensional rows). With no pairs the result has zero rows.
    """
    neigh_y = NearestNeighbors(n_neighbors=k).fit(y)
    # For every sample of x, the indices of its k nearest neighbours in y.
    indice_y = neigh_y.kneighbors(x, return_distance=False)
    neigh_x = NearestNeighbors(n_neighbors=k).fit(x)

    mnnset = []
    for cnt, ind_y in enumerate(indice_y):
        # For each y-neighbour of x[cnt], its k nearest neighbours back in x.
        indice_x = neigh_x.kneighbors(y[ind_y], return_distance=False)
        # Rows containing cnt are the y-neighbours that also "see" x[cnt].
        row, _ = np.where(indice_x == cnt)
        for temp_y in row:
            mnnset.append([cnt, ind_y[temp_y]])

    # reshape keeps the (n_pairs, 2) layout even when no pair was found,
    # so the column indexing below cannot fail on an empty result.
    mnn_indice = np.asarray(mnnset, dtype=np.intp).reshape(-1, 2)
    print(mnn_indice)  # debug output: the (x index, y index) pairs
    # The samples themselves are returned rather than the indices; the x
    # part and the y part are concatenated into a single matrix.
    return np.concatenate((x[mnn_indice[:, 0]], y[mnn_indice[:, 1]]), axis=1)
# findMNN first returns the collection of all MNN pairs.
# Given anchor_sample, positive_sample and the sets x and y, decide whether
# this pair of samples is an MNN pair (a vectorised variant might be better).
def quary_xy_mnn(anchor_sample, positive_sample, set_x, set_y, k=20):
    """Return whether (anchor_sample, positive_sample) is an MNN pair.

    Args:
        anchor_sample: 1-D ndarray, a sample taken from ``set_x``.
        positive_sample: 1-D ndarray, a sample taken from ``set_y``.
        set_x: 2-D ndarray passed to ``findMNN`` as ``x``.
        set_y: 2-D ndarray passed to ``findMNN`` as ``y``.
        k: number of neighbours forwarded to ``findMNN``.

    Returns:
        True if the concatenated pair equals some row of the MNN matrix.
    """
    res = findMNN(set_x, set_y, k=k)
    # Both samples are 1-D, so they are joined along axis 0 (no axis 1).
    temp_test = np.concatenate((anchor_sample, positive_sample), axis=0)
    # The pair is present iff one full row of res matches the joined vector.
    return bool((res == temp_test).all(axis=1).any())


# Usage example:
#i=0;
#j=0;
#print(quary_xy_mnn(x[i],y[j],x,y))
# MNN pairs found with Euclidean distance; whether other distances work the
# same way is untested, but this version behaves as expected.
mnnset = findMNN(x, y, k=2)
# print(mnnset)
# Plot the whole data set.
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 12))
plt.scatter(x[:, 0], x[:, 1], color="r", s=100)
plt.scatter(y[:, 0], y[:, 1], color="g", s=100)
# Label every point with its row index.
for i in range(x.shape[0]):
    plt.text(x[i, 0], x[i, 1], str(i), fontsize=20)
for i in range(y.shape[0]):
    plt.text(y[i, 0], y[i, 1], str(i), fontsize=20)


def connectpoints(x, y, p1, p2):
    """Highlight two points and draw a line between them.

    Args:
        x: sequence of x coordinates.
        y: sequence of y coordinates.
        p1: position of the first point (drawn in red).
        p2: position of the second point (drawn in green).
    """
    x1, x2 = x[p1], x[p2]
    y1, y2 = y[p1], y[p2]
    plt.scatter(x1, y1, color='r', s=150)
    plt.scatter(x2, y2, color="g", s=150)
    plt.plot([x1, x2], [y1, y2])


# Each row of mnnset is (x0, x1, y0, y1) for the 2-D data used here.
# Local names are used so the global datasets x and y are not overwritten.
for pair in mnnset:
    connectpoints([pair[0], pair[2]], [pair[1], pair[3]], 0, 1)
# plt.axis('equal')
plt.show()

在这里插入图片描述
在这里插入图片描述

使用annoy计算(欧式距离)

#from typing import Final
from annoy import AnnoyIndex
KNN = 1
# Exact nearest neighbors search.
def nn(ds1, ds2, knn=KNN, metric_p=2):
    """Return the exact ``knn`` nearest neighbours in ``ds2`` of each row of ``ds1``.

    Args:
        ds1: 2-D ndarray of query samples, one per row.
        ds2: 2-D ndarray of indexed samples, one per row.
        knn: number of neighbours per query.
        metric_p: value passed as ``p`` to ``NearestNeighbors``
            (2 selects Euclidean distance).

    Returns:
        Set of ``(i, j)`` tuples meaning ``ds2[j]`` is one of the ``knn``
        nearest neighbours of ``ds1[i]``.
    """
    # Find nearest neighbors of first dataset.
    nn_ = NearestNeighbors(n_neighbors=knn, p=metric_p)
    nn_.fit(ds2)
    ind = nn_.kneighbors(ds1, return_distance=False)

    # Plain ints keep the printed set readable; they hash and compare the
    # same as the NumPy integers they replace.
    return {(a, int(b_i)) for a, b in enumerate(ind) for b_i in b}


# Approximate nearest neighbors using locality sensitive hashing.
def nn_approx(ds1, ds2, knn=KNN, metric='euclidean', n_trees=10):
    """Return approximate nearest neighbours in ``ds2`` of each row of ``ds1``.

    Args:
        ds1: 2-D ndarray of query samples, one per row.
        ds2: 2-D ndarray of samples to index, one per row.
        knn: number of neighbours requested per query.
        metric: Annoy metric name used for the index.
        n_trees: number of trees built for the Annoy index.

    Returns:
        Set of ``(i, j)`` tuples meaning ``ds2[j]`` was returned by Annoy as
        a neighbour of ``ds1[i]``.
    """
    # Build index.
    index = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        index.add_item(i, ds2[i, :])
    index.build(n_trees)

    # Search index and match. The neighbour lists are consumed directly
    # instead of being stacked into an ndarray, so queries that return
    # fewer than knn neighbours cannot produce a ragged-array error.
    match = set()
    for a in range(ds1.shape[0]):
        for b_i in index.get_nns_by_vector(ds1[a, :], knn, search_k=-1):
            match.add((a, b_i))
    return match


# Find mutual nearest neighbors.
def mnn(ds1, ds2, knn=KNN, approx=True):
    """Return the mutual nearest neighbour pairs between ``ds1`` and ``ds2``.

    Args:
        ds1: 2-D ndarray, one sample per row.
        ds2: 2-D ndarray, one sample per row.
        knn: number of neighbours searched in each direction.
        approx: use ``nn_approx`` (Annoy) when True, ``nn`` (exact) otherwise.

    Returns:
        Set of ``(i, j)`` tuples where ``ds2[j]`` is a neighbour of
        ``ds1[i]`` and ``ds1[i]`` is a neighbour of ``ds2[j]``.
    """
    search = nn_approx if approx else nn
    # Find nearest neighbors in first direction.
    match1 = search(ds1, ds2, knn=knn)
    # Find nearest neighbors in second direction.
    match2 = search(ds2, ds1, knn=knn)
    # Compute mutual nearest neighbors: flip the second direction so both
    # sets are expressed as (ds1 index, ds2 index).
    return match1 & {(b, a) for a, b in match2}


np.random.seed(1)  # fix the random seed so the demo data is reproducible
x=np.random.randn(20,2)# x: 20 samples with 2 features each
y=np.random.randn(20,2)# y: 20 samples with 2 features each
# Compute the MNN pairs twice: with the Annoy-based search and with the exact search.
mnn_appro=mnn(x,y,knn=2,approx=True)
mnn_precise=mnn(x,y,knn=2,approx=False)
print("=========近似寻找一共找到{}个========".format(len(mnn_appro)))
print(mnn_appro)
print("=========精确寻找一共找到{}个========".format(len(mnn_precise)))
print(mnn_precise)
# Size of the intersection shows how many pairs both methods agree on.
print("=========两种一共交集有{}个==========".format(len(mnn_appro & mnn_precise)))
#print(len(mnn_appro & mnn_precise))# The two results agree; open question: how to also return the distances.

在这里插入图片描述

测试cosine距离

#主要的判断方式如下
import numpy as np
from sklearn.neighbors import NearestNeighbors
#import pyreadr
import numpy as np
# from sklearn.neighbors import NearestNeighbors
# x=pyreadr.read_r("x.RData")
# y=pyreadr.read_r("y.RData")
# x=x["x"].values
# y=y["y"].values
np.random.seed(1)# fix the random seed so the demo data is reproducible
x=np.random.randn(100,2)# x: 100 samples with 2 features each
y=np.random.randn(100,2)# y: 100 samples with 2 features each
# findMNN below takes x and y as ndarrays
# and returns an (ndarray, set) tuple; this version uses cosine distance.
def findMNN(x, y, k=100):
    """Find mutual nearest neighbour (MNN) pairs under cosine distance.

    A pair ``(x[i], y[j])`` is reported when ``y[j]`` is among the ``k``
    nearest neighbours of ``x[i]`` in ``y`` and ``x[i]`` is among the ``k``
    nearest neighbours of ``y[j]`` in ``x``.

    Args:
        x: 2-D ndarray, one sample per row.
        y: 2-D ndarray, one sample per row.
        k: number of neighbours searched in each direction.

    Returns:
        Tuple ``(pairs, index_pairs)``. ``pairs`` is an ndarray with one row
        per MNN pair, the ``x`` sample followed by the matching ``y`` sample
        (two 5-dimensional inputs give 10-dimensional rows).
        ``index_pairs`` is the set of ``(x index, y index)`` tuples for the
        same pairs.
    """
    neigh_y = NearestNeighbors(n_neighbors=k, metric="cosine").fit(y)
    # For every sample of x, the indices of its k nearest neighbours in y.
    indice_y = neigh_y.kneighbors(x, return_distance=False)
    neigh_x = NearestNeighbors(n_neighbors=k, metric="cosine").fit(x)

    mnnset = []
    mnn_self_imple = set()
    for cnt, ind_y in enumerate(indice_y):
        # For each y-neighbour of x[cnt], its k nearest neighbours back in x.
        indice_x = neigh_x.kneighbors(y[ind_y], return_distance=False)
        # Rows containing cnt are the y-neighbours that also "see" x[cnt].
        row, _ = np.where(indice_x == cnt)
        for temp_y in row:
            # Plain int hashes and compares like the NumPy integer it replaces.
            y_index = int(ind_y[temp_y])
            mnnset.append([cnt, y_index])
            mnn_self_imple.add((cnt, y_index))

    # reshape keeps the (n_pairs, 2) layout even when no pair was found,
    # so the column indexing below cannot fail on an empty result.
    mnn_indice = np.asarray(mnnset, dtype=np.intp).reshape(-1, 2)
    # print(mnn_indice)
    # The samples are returned as one concatenated matrix, together with
    # the index pairs as a set.
    pairs = np.concatenate((x[mnn_indice[:, 0]], y[mnn_indice[:, 1]]), axis=1)
    return (pairs, mnn_self_imple)
# findMNN first returns the collection of all MNN pairs.
# Given anchor_sample, positive_sample and the sets x and y, decide whether
# this pair of samples is an MNN pair (a vectorised variant might be better).
def quary_xy_mnn(anchor_sample, positive_sample, set_x, set_y, k=20):
    """Return whether (anchor_sample, positive_sample) is an MNN pair.

    Args:
        anchor_sample: 1-D ndarray, a sample taken from ``set_x``.
        positive_sample: 1-D ndarray, a sample taken from ``set_y``.
        set_x: 2-D ndarray passed to ``findMNN`` as ``x``.
        set_y: 2-D ndarray passed to ``findMNN`` as ``y``.
        k: number of neighbours forwarded to ``findMNN``.

    Returns:
        True if the concatenated pair equals some row of the MNN matrix.
    """
    res = findMNN(set_x, set_y, k=k)
    # The cosine findMNN returns (matrix, index set); only the matrix of
    # concatenated samples is compared here.
    if isinstance(res, tuple):
        res = res[0]
    # Both samples are 1-D, so they are joined along axis 0 (no axis 1).
    temp_test = np.concatenate((anchor_sample, positive_sample), axis=0)
    # The pair is present iff one full row of res matches the joined vector.
    return bool((res == temp_test).all(axis=1).any())


# Usage example:
#i=0;
#j=0;
#print(quary_xy_mnn(x[i],y[j],x,y))
# MNN pairs found with cosine distance: mnnset holds the concatenated
# samples, mnn_self_imple the matching (x index, y index) tuples.
mnnset, mnn_self_imple = findMNN(x, y, k=2)
# print(mnnset)
# Plot the whole data set.
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 12))
plt.scatter(x[:, 0], x[:, 1], color="r", s=100)
plt.scatter(y[:, 0], y[:, 1], color="g", s=100)
# Label every point with its row index.
for i in range(x.shape[0]):
    plt.text(x[i, 0], x[i, 1], str(i), fontsize=20)
for i in range(y.shape[0]):
    plt.text(y[i, 0], y[i, 1], str(i), fontsize=20)


def connectpoints(x, y, p1, p2):
    """Highlight two points and draw a line between them.

    Args:
        x: sequence of x coordinates.
        y: sequence of y coordinates.
        p1: position of the first point (drawn in red).
        p2: position of the second point (drawn in green).
    """
    x1, x2 = x[p1], x[p2]
    y1, y2 = y[p1], y[p2]
    plt.scatter(x1, y1, color='r', s=150)
    plt.scatter(x2, y2, color="g", s=150)
    plt.plot([x1, x2], [y1, y2])


# Each row of mnnset is (x0, x1, y0, y1) for the 2-D data used here.
# Local names are used so the global datasets x and y are not overwritten.
for pair in mnnset:
    connectpoints([pair[0], pair[2]], [pair[1], pair[3]], 0, 1)
# plt.axis('equal')
plt.show()

结果如下
在这里插入图片描述

使用annoy来计算cosine距离

#from typing import Final
from annoy import AnnoyIndex
KNN = 1
# Exact nearest neighbors search.
def nn(ds1, ds2, knn=KNN, metric_p="cosine"):
    """Return the exact ``knn`` nearest neighbours in ``ds2`` of each row of ``ds1``.

    Args:
        ds1: 2-D ndarray of query samples, one per row.
        ds2: 2-D ndarray of indexed samples, one per row.
        knn: number of neighbours per query.
        metric_p: value passed as ``metric`` to ``NearestNeighbors``.

    Returns:
        Set of ``(i, j)`` tuples meaning ``ds2[j]`` is one of the ``knn``
        nearest neighbours of ``ds1[i]``.
    """
    # Find nearest neighbors of first dataset.
    nn_ = NearestNeighbors(n_neighbors=knn, metric=metric_p)
    nn_.fit(ds2)
    ind = nn_.kneighbors(ds1, return_distance=False)

    # Plain ints keep the printed set readable; they hash and compare the
    # same as the NumPy integers they replace.
    return {(a, int(b_i)) for a, b in enumerate(ind) for b_i in b}


# Approximate nearest neighbors using locality sensitive hashing.
def nn_approx(ds1, ds2, knn=KNN, metric='angular', n_trees=10):
    """Return approximate nearest neighbours in ``ds2`` of each row of ``ds1``.

    Args:
        ds1: 2-D ndarray of query samples, one per row.
        ds2: 2-D ndarray of samples to index, one per row.
        knn: number of neighbours requested per query.
        metric: Annoy metric name used for the index. The default
            ``'angular'`` is used here as the counterpart of cosine distance.
        n_trees: number of trees built for the Annoy index.

    Returns:
        Set of ``(i, j)`` tuples meaning ``ds2[j]`` was returned by Annoy as
        a neighbour of ``ds1[i]``.
    """
    # Build index.
    index = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        index.add_item(i, ds2[i, :])
    index.build(n_trees)

    # Search index and match. The neighbour lists are consumed directly
    # instead of being stacked into an ndarray, so queries that return
    # fewer than knn neighbours cannot produce a ragged-array error.
    match = set()
    for a in range(ds1.shape[0]):
        for b_i in index.get_nns_by_vector(ds1[a, :], knn, search_k=-1):
            match.add((a, b_i))
    return match


# Find mutual nearest neighbors.
def mnn(ds1, ds2, knn=KNN, approx=True):
    """Return the mutual nearest neighbour pairs between ``ds1`` and ``ds2``.

    Args:
        ds1: 2-D ndarray, one sample per row.
        ds2: 2-D ndarray, one sample per row.
        knn: number of neighbours searched in each direction.
        approx: use ``nn_approx`` (Annoy) when True, ``nn`` (exact) otherwise.

    Returns:
        Set of ``(i, j)`` tuples where ``ds2[j]`` is a neighbour of
        ``ds1[i]`` and ``ds1[i]`` is a neighbour of ``ds2[j]``.
    """
    search = nn_approx if approx else nn
    # Find nearest neighbors in first direction.
    match1 = search(ds1, ds2, knn=knn)
    # Find nearest neighbors in second direction.
    match2 = search(ds2, ds1, knn=knn)
    # Compute mutual nearest neighbors: flip the second direction so both
    # sets are expressed as (ds1 index, ds2 index).
    return match1 & {(b, a) for a, b in match2}


np.random.seed(1)  # fix the random seed so the demo data is reproducible
x=np.random.randn(100,2)# x: 100 samples with 2 features each
y=np.random.randn(100,2)# y: 100 samples with 2 features each
# Compute the MNN pairs with the Annoy-based search and with the exact search.
mnn_appro=mnn(x,y,knn=2,approx=True)
mnn_precise=mnn(x,y,knn=2,approx=False)
print("=========近似寻找一共找到{}个========".format(len(mnn_appro)))
print(mnn_appro)
print("=========精确寻找一共找到{}个========".format(len(mnn_precise)))
print(mnn_precise)
# mnn_self_imple is the index-pair set returned by the earlier findMNN(x,y,k=2) call.
print("=========自己实现一共找到{}个========".format(len(mnn_self_imple)))
print(mnn_self_imple)
# Size of the intersection shows how many pairs all three methods agree on.
print("=========三种一共交集有{}个==========".format(len(mnn_appro & mnn_precise & mnn_self_imple)))
#print(len(mnn_appro & mnn_precise))# The results agree; open question: how to also return the distances.

在这里插入图片描述

代码编程
赞赏

相关文章

【原创】基于SSM的超市进销存管理系统(超市管理系统毕业设计)
手机进销存系统
Z5NTS功能之icmp
Flink原理与调优
软件工程课堂作业(七)——电梯调度之需求规格说明书
软件工程课堂训练——电梯调度问题需求分析