2021-03-24
English word tokenization error


from nltk.tokenize import word_tokenize

string = df['title'][0]   # df: a DataFrame of articles loaded earlier; take the first title
string
# Output:
# 'Economy Gains 227,000 Jobs in January Under President Trump - Breitbart'

word_tokenize(string)

The error message is:

---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
<ipython-input-38-543e2cb9f883> in <module>
----> 1 word_tokenize(string)

C:\ProgramData\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in word_tokenize(text, language, preserve_line)
    127     :type preserve_line: bool
    128     """
--> 129     sentences = [text] if preserve_line else sent_tokenize(text, language)
    130     return [
    131         token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)

C:\ProgramData\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in sent_tokenize(text, language)
    104     :param language: the model name in the Punkt corpus
    105     """
--> 106     tokenizer = load("tokenizers/punkt/{0}.pickle".format(language))
    107     return tokenizer.tokenize(text)
    108 

C:\ProgramData\Anaconda3\lib\site-packages\nltk\data.py in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
    750 
    751     # Load the resource.
--> 752     opened_resource = _open(resource_url)
    753 
    754     if format == "raw":

C:\ProgramData\Anaconda3\lib\site-packages\nltk\data.py in _open(resource_url)
    875 
    876     if protocol is None or protocol.lower() == "nltk":
--> 877         return find(path_, path + [""]).open()
    878     elif protocol.lower() == "file":
    879         # urllib might not use mode='rb', so handle this one ourselves:

C:\ProgramData\Anaconda3\lib\site-packages\nltk\data.py in find(resource_name, paths)
    583     sep = "*" * 70
    584     resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep)
--> 585     raise LookupError(resource_not_found)
    586 
    587 

LookupError: 
**********************************************************************
  Resource punkt not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt')

  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt/english.pickle

  Searched in:
    - 'C:\\Users\\chen/nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\share\\nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\chen\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************
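
As the traceback shows, word_tokenize first calls sent_tokenize, which needs the Punkt sentence tokenizer model, and that model has not been downloaded on this machine. Below is a minimal sketch of the fix suggested by the error message itself, assuming the machine can reach the NLTK download server and one of the listed nltk_data directories is writable:

import nltk

# One-time download of the Punkt model; it is cached under an nltk_data
# directory (one of the paths listed in the error above).
nltk.download('punkt')

from nltk.tokenize import word_tokenize
word_tokenize('Economy Gains 227,000 Jobs in January Under President Trump - Breitbart')

If the download is blocked (e.g. by a proxy), the punkt package can also be fetched manually via the page referenced in the error (https://www.nltk.org/data.html) and unzipped into any of the searched directories, so that the files end up under tokenizers\punkt, e.g. C:\nltk_data\tokenizers\punkt.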

