I am trying to extract data (tables) from pdf files and store them as data frames.
library(pdftools)
library(tabulizerjars)
library(tabulizer)
library(tidyverse)
# Location of the bank-statement PDF that both approaches below read from
f <- file.path("D:/Araratbank/Statement USD-pages-1.pdf")

# Approach 1: pdftools -- pull the raw text of the PDF into a character vector
text <- pdf_text(pdf = f)
print(text)
#> [1] "´³ÝϳÛÇÝ·³ÕïÝÇù +\r\n γï³ñáÕ`\r\n îå»ó` سñ·³ñ۳ݲÝݳèáµ»ñïÇF226 17/12/19 13:45:39\r\n ø²Ôì²Ìø´²ÜβÚÆÜвÞìÆò\r\n ïñ³Ù³¹ñٳݳÙë³ÃÇíÁ 17/12/19 13:46:16\r\n ´³ÝÏ AM24149, ÚáõÝǵ³ÝÏäñÇí»Ù³ëݳ×ÛáõÕ\r\n г׳Ëáñ¹Ç³ÝáõÝÁ/³Ýí³ÝáõÙÁ§²¸²ØÆàôئêäÀ\r\n гëó»Ð²Ú²êî²Üºñ¨³ÝèáõµÇÝÛ³Ýó÷áÕ. 21/3-19\r\n г׳Õáñ¹ÇѳßíÇѳٳñÁ/²ñÅáõÛÃÁ 24149000206001 USD\r\n ø³Õí³ÍùÇѳٳñ\r\n ܳËáñ¹ù³Õí³ÍùÇÓ¨³íáñٳݳÙë³ÃÇí 01/09/19\r\n êϽµÝ³Ï³ÝÙݳóáñ¹ 01/09/19 CR USD 358,048.19\r\n F226 --1\r\n²Ùë³ÃÇíö³ëï³ÃÕÃÇö³ëï³ÃÕÃǶáõÙ³ñ DB/ êï³óáÕÇ/ ì׳ñáÕÇêï³óáÕÇ/ ì׳ñáÕÇêï³óáÕÇ/ í׳ñáÕÇÜå³ï³ÏÁ\r\n ѳٳñÑÕÙ³ÝѳٳñÁ CR ѳßíÇѳٳñ³ÝáõÝÁ/³Ýí³ÝáõÙÁµ³ÝÏ\r\n PEPSICO HOLDINGS LLC BLICRUMM / HSBC BANK INVOICE 03/00362660-19 DD 07.08.19A CC. TO\r\n 02/09/19 190902021464049 190902049382049 7,336.83 DB 38410000000213 141580,RU SSIA,MOSCOW (RR) OOO CONTRACT N PS/AD 001/02-18D D 14.02.18\r\n SANDORA LTD 57262, CITIUAUK / CITIBANK INV 32015 DD 06.08.19 ACC. TO CONT RACT N\r\n 02/09/19 190902021461049 190902049391049 12,260.20 DB 38410000000213 UKRAINA, N IKOLAEVSKAYA (UKRAINE) S-19-3972 DD 01.06.2019 FOR NATURAL\r\n JSC PERMALKO, AVTBRUMMXXX / URALSIB INVOICE 255 DD 03.09.19 ACC. TO C\r\n 03/09/19 190903041599049 190903047747049 20,082.24 DB 38410000000213 RUSSIA,614990,G.PERM, BANK OAO ONTRACT N282-15 DTD. 16.09.2015 FO R\r\n OOO RODNIK I K AVTBRUMMXXX / URALSIB INVOICES 184-190 DD 20.08.19 ACC . TO\r\n 03/09/19 190903041597049 190903047761049 93,139.20 DB 38410000000213 RUSSIA,MOSKOVSKA YA BANK OAO CONTRACT N62-M DD 10.05.2016F OR\r\n GLOBAL SPIRITS GROUP MUNIUA22 / TASCOMBANK INVOICES 18,19 DD 23.08.19 ACC. TOC\r\n 03/09/19 190903041591049 190903047819049 41,015.88 DB 38410000000213 LLC 12 VYACHESLAV JSC (FORMERLY BANK ONTRACT N 06/2019-A DD 13.07.19 FOR\r\n ABRAHAM JACOBI- THE RZBAATWW RAIFFEISEN\r\n 04/09/19 ASW07394/040919 190904088136000 14,307.58 CR 38410000000197 BEER STORE 3-22 S.Y. BANK INTERNATIONAL AG\r\n M.D. 
AVIATION SERVICES RZBAATWW RAIFFEISEN INV:03092019 DATE 03/09/19\r\n 04/09/19 ASW97492/030919 190904088137000 14,371.58 CR 38410000000197 LTD 30 SHD. GOSHEN BANK INTERNATIONAL AG\r\n GLOBAL SPIRITS GROUP MUNIUA22 / TASCOMBANK INVOICE 12 DD 09.08.19 ACC. TO CONT RACT\r\n 05/09/19 190905032684049 190905035088049 300.00 DB 38410000000213 LLC 12 VYACHESLAV JSC (FORMERLY BANK N 06/2019-A DD 13.07.19 FOR AD VERTISING\r\n LLC WORLD TRADE BAGAGE22 / BANK OF INVOICE 809 DD 27.08.19 ACC TO CON TRACT\r\n 05/09/19 190905032676049 190905035147049 6,160.00 DB 38410000000213 COMPANY GEORGI GEORGIA N 071218 DD 07/12/18 FOR TRAN SPORTATION\r\n´³ÝϳÛÇÝ·³ÕïÝÇù*\r\n 1\r\n"
# Approach 2: tabulizer -- detect tables in the same PDF; method = "decide"
# leaves the choice of detection algorithm to the package
statement <- extract_tables(f, method = "decide")

# Inspect the structure of the returned list of tables
str(statement)
#> List of 1
#> $ : chr [1:20, 1:9] "2Ã\231ë3ÃÇÃ""""""02/09/19" ...
# Show the full contents of the extracted table(s)
print(statement)
#> [[1]]
#> [,1] [,2] [,3]
#> [1,] "2Ã\231ë3ÃÇÃ""ö 3ëï3ÃÕÃÇ""ö 3ëï3ÃÕÃÇ"
#> [2,] """Ñ3Ã\2313ñ""ÑÕÃ\2313Ã\235 Ñ3Ã\2313ñÃ\201"
#> [3,] """"""
#> [4,] "02/09/19""190902021464049""190902049382049"
#> [5,] """"""
#> [6,] "02/09/19""190902021461049""190902049391049"
#> [7,] """"""
#> [8,] "03/09/19""190903041599049""190903047747049"
#> [9,] """"""
#> [10,] "03/09/19""190903041597049""190903047761049"
#> [11,] """"""
#> [12,] "03/09/19""190903041591049""190903047819049"
#> [13,] """"""
#> [14,] "04/09/19""ASW07394/040919""190904088136000"
#> [15,] """"""
#> [16,] "04/09/19""ASW97492/030919""190904088137000"
#> [17,] """"""
#> [18,] "05/09/19""190905032684049""190905035088049"
#> [19,] """"""
#> [20,] "05/09/19""190905032676049""190905035147049"
#> [,4] [,5] [,6]
#> [1,] "¶áõÃ\2313ñ DB/""""êï3óáÕÇ/ì×3ñáÕÇ"
#> [2,] """CR""Ñ3ßÃÇ Ñ3Ã\2313ñ"
#> [3,] """"""
#> [4,] "7,336.83""DB""38410000000213"
#> [5,] """"""
#> [6,] "12,260.20""DB""38410000000213"
#> [7,] """"""
#> [8,] "20,082.24""DB""38410000000213"
#> [9,] """"""
#> [10,] "93,139.20""DB""38410000000213"
#> [11,] """"""
#> [12,] "41,015.88""DB""38410000000213"
#> [13,] """"""
#> [14,] "14,307.58""CR""38410000000197"
#> [15,] """"""
#> [16,] "14,371.58""CR""38410000000197"
#> [17,] """"""
#> [18,] "300.00""DB""38410000000213"
#> [19,] """"""
#> [20,] "6,160.00""DB""38410000000213"
#> [,7] [,8]
#> [1,] "êï3óáÕÇ/ì×3ñáÕÇ""êï3óáÕÇ/Ã×3ñáÕÇ"
#> [2,] "3Ã\235áõÃ\235Ã\201/3Ã\235Ã3Ã\235áõÃ\231Ã\201""μ3Ã\235Ã\217"
#> [3,] "PEPSICO HOLDINGS LLC""BLICRUMM / HSBC BANK"
#> [4,] "141580,RU SSIA,MOSCOW""(RR) OOO"
#> [5,] "SANDORA LTD57262,""CITIUAUK / CITIBANK"
#> [6,] "UKRAINA, N IKOLAEVSKAYA""(UKRAINE)"
#> [7,] "JSC PERMALKO,""AVTBRUMMXXX / URALSIB"
#> [8,] "RUSSIA,614990,G.PERM,""BANK OAO"
#> [9,] "OOO RODNIK I K""AVTBRUMMXXX / URALSIB"
#> [10,] "RUSSIA,MOSKOVSKA YA""BANK OAO"
#> [11,] "GLOBAL SPIRITS GROUP""MUNIUA22 / TASCOMBANK"
#> [12,] "LLC12 VYACHESLAV""JSC (FORMERLY BANK"
#> [13,] "ABRAHAM JACOBI- THE""RZBAATWW RAIFFEISEN"
#> [14,] "BEER STORE 3-22 S.Y.""BANK INTERNATIONAL AG"
#> [15,] "M.D. AVIATION SERVICES""RZBAATWW RAIFFEISEN"
#> [16,] "LTD 30 SHD. GOSHEN""BANK INTERNATIONAL AG"
#> [17,] "GLOBAL SPIRITS GROUP""MUNIUA22 / TASCOMBANK"
#> [18,] "LLC12 VYACHESLAV""JSC (FORMERLY BANK"
#> [19,] "LLC WORLD TRADE""BAGAGE22 / BANK OF"
#> [20,] "COMPANYGEORGI""GEORGIA"
#> [,9]
#> [1,] "Üå3ï3Ã\217Ã\201"
#> [2,] ""
#> [3,] "INVOICE 03/00362660-19 DD 07.08.19A CC. TO"
#> [4,] "CONTRACT N PS/AD 001/02-18D D 14.02.18"
#> [5,] "INV 32015 DD 06.08.19 ACC. TO CONT RACT N"
#> [6,] "S-19-3972 DD 01.06.2019 FOR NATURAL"
#> [7,] "INVOICE 255 DD 03.09.19 ACC. TO C"
#> [8,] "ONTRACT N282-15 DTD. 16.09.2015 FO R"
#> [9,] "INVOICES 184-190 DD 20.08.19 ACC . TO"
#> [10,] "CONTRACT N62-M DD 10.05.2016F OR"
#> [11,] "INVOICES 18,19 DD 23.08.19 ACC. TOC"
#> [12,] "ONTRACT N 06/2019-A DD 13.07.19 FOR"
#> [13,] ""
#> [14,] ""
#> [15,] "INV:03092019DATE 03/09/19"
#> [16,] ""
#> [17,] "INVOICE 12 DD 09.08.19 ACC. TO CONT RACT"
#> [18,] "N 06/2019-A DD 13.07.19 FOR AD VERTISING"
#> [19,] "INVOICE 809 DD 27.08.19 ACC TO CON TRACT"
#> [20,] "N 071218 DD 07/12/18 FOR TRAN SPORTATION"
Created on 2020-01-07 by the reprex package (v0.3.0)
Both options return long rows of unstructured and messy data. Is there any other way to extract this type of data from PDF files (to get the tables as data frames), or do I have to clean and tidy the data myself? You can find the file here: statement USD