問題

我嘗試從這個專案使用此命令

 python create_text_folder.py -i Apache -o output_dir
 

我使用檔案 mailcorpus.json 中的這份 JSON 資料作為輸入

 [
   {
      "id":12,
      "mailing_list_url":"12",
      "type_of_recipient":"before",
      "email_address":"[email protected]",
      "message_body":"Here is one text to test sentiment and feel happy",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"before",
      "email_address":"[email protected]",
      "message_body":"Here is one text to test sentiment and feel happy and feel fine",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"after",
      "email_address":"[email protected]",
      "message_body":"Not feel so good for this code",
      "is_response_of":"after"
   }
]
 

命令的第一部分是這段程式碼

 import rpy2.robjects as robjects
from bs4 import BeautifulSoup as BS4
from rpy2.robjects.packages import importr
import json
from email_reply_parser import EmailReplyParser

'''
NLoN training
'''
def training_nlon():
    """Import the NLoN R package and build a model from the training data.

    Returns a ``(package, model)`` tuple: the handle returned by
    ``importr('NLoN')`` and the result of calling its ``NLoNModel`` with the
    R objects named ``text`` and ``rater``.
    """
    nlon_package = importr('NLoN')

    # Path to NLoN training data.
    # NOTE(review): presumably this .rda file is what defines the `text` and
    # `rater` R objects read below -- confirm against the data file.
    load_rdata = robjects.r['load']
    load_rdata('data/training_data.rda')

    training_text = robjects.r['text']
    training_rater = robjects.r['rater']
    model = nlon_package.NLoNModel(training_text, training_rater)
    return nlon_package, model
'''
Gets mail corpus from email addresses
'''
def get_mail_corpus(nlon_cleaning=False):
    """Group the reply text of the mail corpus by sender address.

    Reads ``data/mailcorpus.json`` (a JSON list of message objects) and
    processes only the records whose ``type_of_recipient`` is exactly
    ``'From'``; every other record is skipped, so a corpus without such
    records produces an empty result.  For each processed record the
    ``reply`` text returned by ``EmailReplyParser.read`` is appended to the
    list stored under the record's ``email_address``.

    Args:
        nlon_cleaning: When true, additionally strip HTML markup with
            BeautifulSoup and drop every line for which ``NLoNPredict``
            returns ``'Not'``.

    Returns:
        A dict mapping each email address to the list of its non-empty
        cleaned message texts.
    """
    if nlon_cleaning:
        nlon, nlon_model = training_nlon()

    # Path to mail's corpus
    corpus_file = 'data/mailcorpus.json'
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(corpus_file, encoding='utf-8') as data_file:
        corpus = json.load(data_file)

    total = len(corpus)
    print('Reading and cleaning emails corpus. Number of emails: ' + str(total))
    texts_by_address = {}
    n = 0
    # Text cleaning
    for d in corpus:
        if d['type_of_recipient'] == 'From':
            # Turn escaped "\n" sequences in the stored body into real line
            # breaks before parsing.
            # NOTE(review): the escapes were reconstructed from a garbled
            # listing -- confirm against the upstream file.
            res = EmailReplyParser.read(d['message_body'].replace('\\n', '\n'))
            text = res.reply
            n += 1

            if nlon_cleaning:
                try:
                    clean_message_body = BS4(text, 'html.parser').text
                except Exception as e:
                    # Best effort: keep going with the unparsed text.
                    print('Error with BS4 on text:\n\n%s\n\n' % text, str(e))
                    clean_message_body = text.strip()
                # Filter the HTML-stripped text line by line.  (Previously the
                # raw text was filtered and the BeautifulSoup result was
                # thrown away.)
                kept_lines = [
                    line
                    for line in clean_message_body.splitlines()
                    if nlon.NLoNPredict(
                        nlon_model, robjects.StrVector([line]))[0] != 'Not'
                ]
                text = '\n'.join(kept_lines)

            if text != '':
                texts_by_address.setdefault(d['email_address'], []).append(text)

        # Progress line: emitted for every record while the number of
        # processed mails is a multiple of 50 (which includes 0).
        if n % 50 == 0:
            print(str(n) + '/' + str(total))

    print('Mails retrieved: ' + str(n))
    print('Email addresses: ' + str(len(texts_by_address)))
    return texts_by_address
 

這段程式碼可以從一個 GitHub 儲存庫取得（連結），該儲存庫並非我所有。

第二部分是以下程式碼:

 import MailCorpus as mc
import sys, getopt, os
import csv

# Usage line shown for -h/--help and for every command-line error.
_USAGE = 'create_text_folder.py -i Apache|LIWC [-nlon] [-p <dataset_path>] -o <output_dir>'


def main(argv):
    """Write one text file per corpus entry into the output directory.

    Args:
        argv: Command-line arguments without the program name.  Recognised
            options are ``-i/--inputdataset`` (``Apache`` or ``LIWC``),
            ``-o/--outputdir``, ``-p/--datasetpath`` (required for ``LIWC``),
            ``-nlon``/``--nlon``/``-n`` and ``-h/--help``.

    For ``Apache`` the texts come from ``mc.get_mail_corpus`` and each
    address gets ``<output_dir>/<address>.txt`` with its messages joined by
    newlines.  For ``LIWC`` the CSV at the dataset path is read as cp1252 and
    each row's second column is written to ``<output_dir>/<first column>``.

    Raises:
        SystemExit: status 2 on unknown options, an unsupported dataset, a
            missing dataset path for ``LIWC`` or a missing output directory;
            status 0 after printing help.
    """
    input_dataset = ''
    output_dir = ''
    dataset_path = ''
    nlon_cleaning = False

    # getopt reads a single-dash "-nlon" as the bundled short options
    # "-n -l -o n", which silently replaced the output directory with "n".
    # Map the documented single-dash spellings onto real long options first.
    aliases = {'-nlon': '--nlon', '-help': '--help'}
    argv = [aliases.get(token, token) for token in argv]

    try:
        opts, _ = getopt.getopt(
            argv,
            'hi:o:p:n',
            ['help', 'nlon', 'inputdataset=', 'outputdir=', 'datasetpath='],
        )
    except getopt.GetoptError:
        print(_USAGE)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(_USAGE)
            sys.exit()
        elif opt in ('-n', '--nlon'):
            nlon_cleaning = True
        elif opt in ('-i', '--inputdataset'):
            input_dataset = arg
        elif opt in ('-o', '--outputdir'):
            output_dir = arg
        elif opt in ('-p', '--datasetpath'):
            dataset_path = arg

    print('Dataset: ' + str(input_dataset))
    print('NLoN: ' + str(nlon_cleaning))
    print('Dataset path: ' + dataset_path)
    print('Output directory: ' + output_dir)

    is_apache = input_dataset == 'Apache'
    is_liwc = input_dataset == 'LIWC' and dataset_path != ''
    if not (is_apache or is_liwc):
        print('Wrong input dataset')
        print(_USAGE)
        sys.exit(2)
    if output_dir == '':
        # os.makedirs('') would fail with an unhelpful FileNotFoundError.
        print('Missing output directory')
        print(_USAGE)
        sys.exit(2)

    if is_apache:
        texts = mc.get_mail_corpus(nlon_cleaning)
        os.makedirs(output_dir, exist_ok=True)
        for address, messages in texts.items():
            with open(output_dir + '/' + str(address) + '.txt', 'w') as text_file:
                print('\n'.join(messages), file=text_file)
    else:
        # Path to liwc gold standard
        # header is 'ID,text,cEXT,cNEU,cAGR,cCON,cOPN'
        with open(dataset_path, encoding='cp1252') as csv_file:
            texts = {
                row[0]: row[1]
                for row in csv.reader(csv_file, delimiter=',')
            }
        os.makedirs(output_dir, exist_ok=True)
        for key, body in texts.items():
            with open(output_dir + '/' + str(key), 'w') as text_file:
                print(body, file=text_file)


# Run the command-line entry point only when executed as a script,
# passing every argument after the program name.
if __name__ == '__main__':
    main(sys.argv[1:])
 

此程式碼再次來自github儲存庫

程式似乎可以執行,但命令的輸出如下:

 $ python create_text_folder.py -i Apache -o output_dir
Dataset: Apache
NLoN: False
Dataset path:
Output directory: output_dir
Reading and cleaning emails corpus. Number of emails: 3
0/3
0/3
0/3
Mails retrieved: 0
Email addresses: 0
 

我認為是 JSON 格式有問題:程式讀到了 3 個物件,卻沒有處理其中任何一個,output_dir 中也沒有產生任何檔案

我能做什麼?

  最佳答案

我看到你的json中有小錯誤,並糾正它們,這是你可以使用的結果:

 [
   {
      "id":12,
      "mailing_list_url":"12",
      "type_of_recipient":"before",
      "email_address":"[email protected]",
      "message_body":"Here is one text to test sentiment and feel happy",
      "is_response_of":"before"
   },
   {
      "id":21,
      "mailing_list_url":"21",
      "type_of_recipient":"before",
      "email_address":"[email protected]",
      "message_body":"Here is one text to test sentiment and feel happy and feel fine",
      "is_response_of":"before"
   },
   {
      "id":"21",
      "mailing_list_url":"21",
      "type_of_recipient":"after",
      "email_address":"[email protected]",
      "message_body":"Not feel so good for this code",
      "is_response_of":"after"
   }
]
[原件:英文]

如果您的程式碼仍然無法完成您想要的工作,可以在下方留言,之後我會進一步檢查您的程式碼。祝好。

  相同標籤的其他問題

python