April 17, 2019
The main website is at https://spacy.io/
You may also want to check out the company behind spaCy – Explosion AI
The best way to get off the ground is to head over to the Usage page and start from the beginning.
Once you're up and running, work through the spaCy 101 tutorial.
Notebooks
View the IPython notebook for this session on GitHub here
Or launch the notebook in Google Colab or MyBinder:
Code
!pip install -U spacy
!python -m spacy download en_core_web_sm
import spacy
# spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)
Apple PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
Apple Apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY
!python -m spacy download en_core_web_lg
Collecting en_core_web_lg==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz#egg=en_core_web_lg==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
    100% |████████████████████████████████| 826.9MB 54.1MB/s
Installing collected packages: en-core-web-lg
  Running setup.py install for en-core-web-lg ... done
Successfully installed en-core-web-lg-2.1.0
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()
# nlp = spacy.load('en_core_web_lg')
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]
print("apple <-> banana", apple.similarity(banana))
print("pasta <-> hippo", pasta.similarity(hippo))
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)
apple <-> banana 0.5831845
pasta <-> hippo 0.079349115
True True True True
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
u"in 2007, few people outside of the company took him seriously.")
dep_labels = []
for token in doc:
while token.head != token:
dep_labels.append(token.dep_)
token = token.head
print(dep_labels)
['advmod', 'advcl', 'compound', 'nsubj', 'advcl', 'nsubj', 'advcl', 'advcl', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'npadvmod', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'nsubj', 'nsubj', 'advmod', 'nsubj', 'prep', 'advmod', 'nsubj', 'det', 'pobj', 'prep', 'advmod', 'nsubj', 'pobj', 'prep', 'advmod', 'nsubj', 'dobj', 'advmod', 'punct']
Word Mover's Distance
!pip install wmd
Collecting wmd
  Downloading https://files.pythonhosted.org/packages/2f/61/686d4dd4f2e37fea15b3bd04a5b68a74aa2cb54be18a31f59d5703991f0b/wmd-1.3.0.tar.gz (103kB)
    100% |████████████████████████████████| 112kB 2.7MB/s
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from wmd) (1.16.2)
Building wheels for collected packages: wmd
  Building wheel for wmd (setup.py) ... done
  Stored in directory: /root/.cache/pip/wheels/cb/ce/ec/b1bb6b19efe311c995ef7467d299db6d12392bb08456283e92
Successfully built wmd
Installing collected packages: wmd
Successfully installed wmd-1.3.0
import spacy
import wmd
import en_core_web_md
nlp = en_core_web_md.load()
# Hook Word Mover's Distance into doc.similarity() via the wmd package
nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)
doc1 = nlp("Politician speaks to the media in Illinois.")
doc2 = nlp("The president greets the press in Chicago.")
doc3 = nlp("I do not like green eggs and ham.")
print(doc1.similarity(doc2))
print(doc1.similarity(doc3))
print(doc1.similarity(doc1))
Visualization
Note – displacy.serve doesn’t seem to work in Google Colaboratory, since it needs to start a local web server; a notebook-friendly alternative is sketched after the examples below.
from spacy import displacy
displacy.serve(doc, style="dep")
Shutting down server on port 5000.
displacy.serve(doc, style="ent")
Fun with real data
Let’s grab some text from the 20 newsgroups dataset and play around with text written by real people (before bots). A quick pass of the spaCy pipeline over one of the posts follows the category list below.
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
len(newsgroups_train.data)
11314
from pprint import pprint
pprint(list(newsgroups_train.target_names))
['alt.atheism',
'comp.graphics',
'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'misc.forsale',
'rec.autos',
'rec.motorcycles',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.crypt',
'sci.electronics',
'sci.med',
'sci.space',
'soc.religion.christian',
'talk.politics.guns',
'talk.politics.mideast',
'talk.politics.misc',
'talk.religion.misc']
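To actually play with the text, here is a minimal sketch that runs the small English pipeline over one post and pulls out its named entities (the choice of en_core_web_sm and of the first training post is arbitrary):
import spacy

# Parse one raw newsgroup post and list the entities spaCy finds in it
nlp = spacy.load("en_core_web_sm")
doc = nlp(newsgroups_train.data[0])
for ent in doc.ents:
    print(ent.text, ent.label_)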