---
title: "quanteda.io:readtext_test"
author: "C.R."
date: "05/10/2019"
output: html_document
---
A test of the quanteda family of libraries on a French corpus.
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse) # dplyr, purrr, stringr, readr helpers
library(quanteda)  # corpus construction and tokenization
library(cld3)      # Google's Compact Language Detector 3, for language detection
library(jsonlite)  # JSON / NDJSON export (stream_out)
```
## Extract text from download directory
```{r read-pdf}
library(readtext)
DATA_DIR <- "~/Téléchargements"
# read in all PDF files from the download folder
texts <- readtext(file = paste0(DATA_DIR, "/*.pdf"), docvarsfrom = "metadata", verbosity = 2)
# Reading texts from ~/Téléchargements/*.pdf
# PDF error: Invalid Font Weight
#...
# PDF error: Could not parse ligature component "folder" of "folder_close_alt" in parseCharName
# PDF error: Could not parse ligature component "close" of "folder_close_alt" in parseCharName
# PDF error: Could not parse ligature component "level" of "level_down" in parseCharName
# PDF error: Could not parse ligature component "down" of "level_down" in parseCharName
# ... read 108 documents.
# texts
# readtext object consisting of 108 documents and 0 docvars.
# # Description: df[,2] [108 × 2]
# doc_id text
# <chr> <chr>
# 1 [email protected] "\" \"..."
# 2 1703_WhyDoWeVisualiseData.pdf "\"Lisa Charl\"..."
# 3 1910012156_TL-MR3020(EU)_V3_UG.pdf "\"User Guide\"..."
# TODO: de-duplicating the docs fails
# corpus <- corpus(unique(x=texts, by="text"))
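# A possible fix (untested sketch): dplyr::distinct() keeps the first row per
# unique text while preserving the other columns
# texts <- distinct(texts, text, .keep_all = TRUE)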
corpus <- corpus(texts)
save(texts, corpus, file = here::here("data/telechargement_corpus_fr.Rda"))
```
## Corpus
```{r corpus-summary, echo=FALSE}
summary(corpus)
```
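Beyond `summary()`, a minimal sketch for a quick size check, using quanteda's `ndoc()` and `docnames()`:
```{r eval=FALSE}
# Number of documents in the corpus, and a peek at their ids
ndoc(corpus)
head(docnames(corpus))
```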
# What is the vanilla NER capability in spaCy?
## Step 1: Language detection
Are we using the right spaCy model, i.e. which documents actually match the `english` language? A quick tally of the detected languages follows the chunk below.
```{r}
texts <- texts %>% mutate(
  # text = str_squish(text), # DON'T: the text becomes unreadable
  text = str_trim(text), # fix doccano silently left-trimming text after annotation...
  lang = detect_language(text)
)
corpus_fr <- corpus(texts %>% filter(lang == "fr"))
b <- texts(corpus_fr) %>% enframe()
```
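A quick tally of what `cld3` detected, as a sketch, to see how many documents are actually French:
```{r eval=FALSE}
# How many documents per detected language?
texts %>% count(lang, sort = TRUE)
```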
# Export French text as JSON for doccano
```{r}
# There is no ndjson_write() function; jsonlite::stream_out() produces the exact NDJSON format doccano expects
a <- texts %>% filter(lang == "fr") %>% select(text, id = doc_id)
# TODO improvement: doc_id (passed as id) is lost when going through doccano. Is there another doc variable that survives the round trip?
stream_out(a, file(here::here("data/telechargement_corpus_fr.json")))
```
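To check that the output is the NDJSON doccano expects (one standalone JSON object per line), a quick sketch:
```{r eval=FALSE}
# The first line should be a single JSON object such as {"text":"...","id":"..."}
readLines(here::here("data/telechargement_corpus_fr.json"), n = 1)
```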
# Export French sentences as JSON
```{r eval=FALSE, include=FALSE}
sentences <- corpus_fr %>% tokens(what = "sentence") %>% unlist() %>% enframe()
# save() would write a binary .Rda, not JSON; stream_out() writes one JSON object per line
stream_out(sentences, file(here::here("data/discours_phrases_fr.json")))
```
# Export French texts as individual text files
```{r eval=FALSE, include=FALSE}
# walk over the text column (walking over `a` itself would iterate over its columns, not its documents)
walk(a$text, ~ write_file(.x, path = here::here(str_c("data/", uuid::UUIDgenerate(), ".txt"))))
```
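A hedged variant of the export above that names each file after its sanitized `id` instead of a random UUID, so the mapping back to the source document (the doc_id TODO above) is not lost; it assumes the `fs` package is available:
```{r eval=FALSE}
# One file per document, named after its sanitized doc_id
walk2(a$text, a$id,
      ~ write_file(.x, path = here::here(str_c("data/", fs::path_sanitize(.y), ".txt"))))
```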
# Time to annotate with a doccano (or similar) container...