Intro to spaCy

April 17, 2019

Main website is at https://spacy.io/

You may also want to check out the company behind spaCy – Explosion AI

The best way to get off the ground is to head over to the Usage page and start from the beginning.

Once you're up and running, work through the spaCy 101 tutorial.

Notebooks

View the IPython notebook for this session on GitHub here

Or launch the notebook in Google Colab or MyBinder:

Google Colab
Binder

Code

!pip install -U spacy
!python -m spacy download en_core_web_sm
import spacy

# spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)
Apple PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj
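If any of those tag or dependency abbreviations are unfamiliar, spaCy's built-in spacy.explain helper will decode them. A minimal sketch:

print(spacy.explain("PROPN"))   # proper noun
print(spacy.explain("nsubj"))   # nominal subject
print(spacy.explain("pcomp"))   # complement of preposition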
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
Apple Apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False
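Those boolean flags make quick clean-up passes easy. A minimal sketch, reusing the same doc, that keeps only alphabetic non-stop-word tokens and collects their lemmas:

# Keep only purely alphabetic, non-stop-word tokens (this also drops "U.K.", "$" and "1").
content_lemmas = [token.lemma_ for token in doc
                  if token.is_alpha and not token.is_stop]
print(content_lemmas)   # roughly: ['Apple', 'look', 'buy', 'startup', 'billion']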
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY
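The label on each entity can be used to filter or group them. A minimal sketch using the labels printed above:

# Collect entity mentions by label (ORG and MONEY in this example).
orgs = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
money = [ent.text for ent in doc.ents if ent.label_ == "MONEY"]
print(orgs, money)   # expected: ['Apple'] ['$1 billion']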
!python -m spacy download en_core_web_lg
Collecting en_core_web_lg==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz#egg=en_core_web_lg==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
    100% |████████████████████████████████| 826.9MB 54.1MB/s
Installing collected packages: en-core-web-lg
  Running setup.py install for en-core-web-lg ... done
Successfully installed en-core-web-lg-2.1.0
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

# nlp = spacy.load('en_core_web_lg')
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")

apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]

print("apple <-> banana", apple.similarity(banana))
print("pasta <-> hippo", pasta.similarity(hippo))
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)
apple <-> banana 0.5831845
pasta <-> hippo 0.079349115
True True True True
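Similarity isn't limited to tokens: Doc and Span objects support it too, and the raw vector behind each token is exposed directly (the en_core_web_lg vectors are 300-dimensional). A minimal sketch reusing the same doc:

# Compare the two sentences as spans, then peek at the raw word vector.
sent1, sent2 = list(doc.sents)
print("sentence <-> sentence", sent1.similarity(sent2))
print(apple.vector.shape)    # (300,)
print(apple.vector_norm)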
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
          u"in 2007, few people outside of the company took him seriously.")

# For each token, walk up the tree via token.head until we reach the
# sentence root, collecting the dependency label at every step.
dep_labels = []
for token in doc:
    while token.head != token:
        dep_labels.append(token.dep_)
        token = token.head
print(dep_labels)
['advmod', 'advcl', 'compound', 'nsubj', 'advcl', 'nsubj', 'advcl', 'advcl', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'npadvmod', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'amod', 'pobj', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'prep', 'xcomp', 'advcl', 'pobj', 'prep', 'xcomp', 'advcl', 'punct', 'amod', 'nsubj', 'nsubj', 'advmod', 'nsubj', 'prep', 'advmod', 'nsubj', 'det', 'pobj', 'prep', 'advmod', 'nsubj', 'pobj', 'prep', 'advmod', 'nsubj', 'dobj', 'advmod', 'punct']
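That loop walks upward from each token to the sentence root via token.head, which is why labels of tokens high in the tree show up so many times. You can also walk downward. A minimal sketch finding the root and its direct children:

# The root is the one token that is its own head.
root = [token for token in doc if token.head == token][0]
print(root.text, root.dep_)    # likely: took ROOT
print([(child.text, child.dep_) for child in root.children])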

Word Mover's Distance

!pip install wmd
Collecting wmd
  Downloading https://files.pythonhosted.org/packages/2f/61/686d4dd4f2e37fea15b3bd04a5b68a74aa2cb54be18a31f59d5703991f0b/wmd-1.3.0.tar.gz (103kB)
    100% |████████████████████████████████| 112kB 2.7MB/s
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from wmd) (1.16.2)
Building wheels for collected packages: wmd
  Building wheel for wmd (setup.py) ... done
  Stored in directory: /root/.cache/pip/wheels/cb/ce/ec/b1bb6b19efe311c995ef7467d299db6d12392bb08456283e92
Successfully built wmd
Installing collected packages: wmd
Successfully installed wmd-1.3.0
import spacy
import wmd

# Requires the medium English model: python -m spacy download en_core_web_md
import en_core_web_md
nlp = en_core_web_md.load()

# Register the Word Mover's Distance hook so doc.similarity() uses WMD
# instead of the default vector-average cosine similarity.
nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)
doc1 = nlp("Politician speaks to the media in Illinois.")
doc2 = nlp("The president greets the press in Chicago.")
doc3 = nlp("I do not like green eggs and ham.")
print(doc1.similarity(doc2))
print(doc1.similarity(doc3))
print(doc1.similarity(doc1))

Visualization

Note: displacy.serve spins up a local web server, which doesn't seem to work in Google Colaboratory.

from spacy import displacy

displacy.serve(doc, style="dep")
Shutting down server on port 5000.
displacy.serve(doc, style="ent")
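Since displacy.serve needs to expose a local web server, a friendlier option inside a notebook is displacy.render, which draws the visualization inline when jupyter=True. A minimal sketch:

from spacy import displacy

# Render inline in the notebook instead of serving on port 5000.
displacy.render(doc, style="dep", jupyter=True)
displacy.render(doc, style="ent", jupyter=True)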

Fun with real data

Let’s grab some text from the 20 newsgroups dataset and play around with posts written by real people (before bots).

from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
len(newsgroups_train.data)
11314
from pprint import pprint
pprint(list(newsgroups_train.target_names))
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
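To actually play with the text, push one of the posts through the pipeline and see what entities come out. A minimal sketch (results will vary by post; newsgroup headers and signatures add plenty of noise):

# Run the loaded pipeline over the first training post and list its entities.
doc = nlp(newsgroups_train.data[0])
for ent in doc.ents:
    print(ent.text, ent.label_)

# For larger batches, nlp.pipe() streams documents through the pipeline
# much faster than calling nlp() one post at a time.
# docs = list(nlp.pipe(newsgroups_train.data[:100]))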
