If you’re looking for a way to use Gensim to set up a doc2vec model, I found that the following works rather well for my use case.

from gensim.models.doc2vec import LabeledSentence

from os import listdir

from os.path import isfile, join

import gensim

import DocIterator as DocIt

# Train a Doc2Vec model on every ``.txt`` file found in TRAIN_DIR and save it
# to ``doc2vec.model``. After each epoch the neighbours of QUERY_DOC are
# printed as a quick sanity check of the learned document vectors.
#
# NOTE(review): the gensim calls below (``size=``, ``train(it)`` without
# ``total_examples``/``epochs``, ``model.docvecs``) follow the older gensim
# API -- confirm the installed gensim version before running.
# NOTE(review): the published listing had lost its indentation; the final
# training pass and the save are assumed to run once, after the epoch loop.

TRAIN_DIR = "/Users/justin/DeepLearning/suck/GBP_USD/train/neu"
QUERY_DOC = "6605c7c39fc7d99889fc047488dc9e33.txt"

INITIAL_ALPHA = 0.04  # starting learning rate
MIN_ALPHA = 0.005     # floor: the learning rate is never lowered below this
ALPHA_STEP = 0.002    # amount subtracted from the learning rate per epoch
EPOCHS = 100

# File names are used as the document labels (presumably the tags under
# which DocIterator registers each document -- QUERY_DOC is looked up by
# file name below).
docLabels = [f for f in listdir(TRAIN_DIR) if f.endswith('.txt')]

# Read the documents in the same order as docLabels so that data[i] is the
# text belonging to docLabels[i].
data = []
for doc in docLabels:
    with open(join(TRAIN_DIR, doc), 'r') as f:
        data.append(f.read())

it = DocIt.DocIterator(data, docLabels)

model = gensim.models.Doc2Vec(size=300, window=10, min_count=5, workers=3,
                              alpha=INITIAL_ALPHA, min_alpha=MIN_ALPHA)

model.build_vocab(it)

for epoch in range(EPOCHS):
    print("Epoch " + str(epoch))
    model.train(it)
    print(model.docvecs.most_similar([QUERY_DOC], topn=10))

    # Decrease the learning rate, but clamp it at MIN_ALPHA: subtracting
    # ALPHA_STEP unconditionally for EPOCHS epochs would drive alpha below
    # zero (0.04 - 100 * 0.002 = -0.16).
    model.alpha = max(model.alpha - ALPHA_STEP, MIN_ALPHA)
    print(model.alpha)
    model.min_alpha = model.alpha  # fix the learning rate, no decay

# One last pass at the final (fixed) learning rate before saving.
model.train(it)
print(model.docvecs.most_similar([QUERY_DOC], topn=10))

model.save("doc2vec.model")

Like what you read? Give Justin Davies a round of applause.

From a quick cheer to a standing ovation, clap to show how much you enjoyed this story.