This file is an extract of zotero-pdf-biblio.bib (Zotero BibTeX export).
Text outside @entry{...} blocks is ignored by BibTeX and serves as commentary.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@inproceedings{lindenhoferCuriousExplorationMalicious2020,
address = {Valletta, Malta},
title = {A {Curious} {Exploration} of {Malicious} {PDF} {Documents}},
isbn = {978-989-758-399-5},
shorttitle = {A {Curious} {Exploration} of {Malicious} {PDF} {Documents}},
url = {http://www.scitepress.org/DigitalLibrary/Link.aspx?doi=10.5220/0008992305770584},
doi = {10.5220/0008992305770584},
abstract = {The storage, modification and exchange of digital information are core processes in our internet connected world. Common document formats enable this digital information infrastructure. More specifically, the widely used PDF document format is a commodity container for digital information. Although PDF files are a well established format, users may not know that they contain not only simple textual information, but can also embed pieces of program code, sometimes malicious code. This paper explores the capabilities of the PDF format and the potential of its built-in functions for malicious purposes. PDF file processors that implement the full PDF standard also potentially enable credential phishing, loss of privacy, malicious code execution and similar attacks via PDF documents. Furthermore, this paper discusses the results of practically evaluated, working code snippets of PDF feature misuse and strategies to obfuscate and hide malicious code parts in a PDF document, while still conforming to the PDF standard.},
language = {en},
urldate = {2021-03-06},
booktitle = {Proceedings of the 6th {International} {Conference} on {Information} {Systems} {Security} and {Privacy}},
publisher = {SCITEPRESS - Science and Technology Publications},
author = {Lindenhofer, Julian and Offenthaler, Rene and Pirker, Martin},
year = {2020},
pages = {577--584},
file = {Lindenhofer et al. - 2020 - A Curious Exploration of Malicious PDF Documents.pdf:/Users/tullsen/Zotero/storage/U7UF8YVG/Lindenhofer et al. - 2020 - A Curious Exploration of Malicious PDF Documents.pdf:application/pdf},
}
@article{leeGrapplingScaleBornDigital2021,
title = {Grappling with the {Scale} of {Born}-{Digital} {Government} {Publications}: {Toward} {Pipelines} for {Processing} and {Searching} {Millions} of {PDFs}},
shorttitle = {Grappling with the {Scale} of {Born}-{Digital} {Government} {Publications}},
url = {http://arxiv.org/abs/2112.02471},
abstract = {Official government publications are key sources for understanding the history of societies. Web publishing has fundamentally changed the scale and processes by which governments produce and disseminate information. Significantly, a range of web archiving programs have captured massive troves of government publications. For example, hundreds of millions of unique U.S. Government documents posted to the web in PDF form have been archived by libraries to date. Yet, these PDFs remain largely unutilized and understudied in part due to the challenges surrounding the development of scalable pipelines for searching and analyzing them. This paper utilizes a Library of Congress dataset of 1,000 government PDFs in order to offer initial approaches for searching and analyzing these PDFs at scale. In addition to demonstrating the utility of PDF metadata, this paper offers computationally-efficient machine learning approaches to search and discovery that utilize the PDFs' textual and visual features as well. We conclude by detailing how these methods can be operationalized at scale in order to support systems for navigating millions of PDFs.},
urldate = {2022-01-23},
journal = {arXiv:2112.02471 [cs]},
eprint = {2112.02471},
eprinttype = {arXiv},
eprintclass = {cs.DL},
author = {Lee, Benjamin Charles Germain and Owens, Trevor},
month = dec,
year = {2021},
note = {22 pages, 4 figures},
keywords = {Computer Science - Digital Libraries, Computer Science - Information Retrieval},
file = {Lee and Owens - 2021 - Grappling with the Scale of Born-Digital Governmen.pdf:/Users/tullsen/Zotero/storage/QT2WGCAN/Lee and Owens - 2021 - Grappling with the Scale of Born-Digital Governmen.pdf:application/pdf},
}
@inproceedings{djukicDomainspecificModelingDocument2021,
author = {Djukić, Verislav and Tolvanen, Juha-Pekka},
title = {Domain-specific modeling in document engineering},
booktitle = {Proceedings of the 21st {ACM} {Symposium} on {Document} {Engineering}},
publisher = {ACM},
address = {Limerick Ireland},
month = aug,
year = {2021},
pages = {1--2},
isbn = {978-1-4503-8596-1},
doi = {10.1145/3469096.3470949},
url = {https://dl.acm.org/doi/10.1145/3469096.3470949},
urldate = {2021-08-17},
language = {en},
}
@article{mosheExploitingURLParsers2022,
title = {Exploiting {URL} parsers: {The} {Good}, {Bad}, and {Inconsistent}},
url = {https://claroty.com/wp-content/uploads/2022/01/Exploiting-URL-Parsing-Confusion.pdf},
abstract = {The Uniform Resource Locator (URL) is integral to our lives online because we use it for surfing the web, accessing files, and joining video chats. If you click on a URL or type it into a browser, you’re requesting a resource hosted somewhere online. As a result, some devices such as our browsers, applications, and servers must receive our URL, parse it into its uniform resource identifier (URI) components (e.g. hostname, path, etc.) and fetch the requested resource.
The syntax of URLs is complex, and although different libraries can parse them accurately, it is plausible for the same URL
to be parsed differently by different libraries. The confusion in URL parsing can cause unexpected behavior in the software
(e.g. web application), and could be exploited by threat actors to cause denial-of-service conditions, information leaks, or
possibly conduct remote code execution attacks.
In Team82's joint research with Snyk, we examined 16 URL parsing libraries, written in a variety of programming
languages, and noticed some inconsistencies with how each chooses to parse a given URL to its basic components. We
categorized the types of inconsistencies into five categories, and searched for problematic code flows in web applications
and open source libraries that exposed a number of vulnerabilities.
We learned that most of the eight vulnerabilities we found largely occurred for two reasons:
1. Multiple Parsers in Use: Whether by design or an oversight, developers sometimes use more than one URL parsing library in projects. Because some libraries may parse the same URL differently, vulnerabilities could be introduced
into the code.
2. Specification Incompatibility: Different parsing libraries are written according to different RFCs or URL specifications, which creates inconsistencies by design. This also leads to vulnerabilities because developers may not be familiar with the differences between URL specifications and their implications (e.g. what should be checked or sanitized)},
language = {English},
author = {Moshe, Noam and Brizinov, Sharon and Onitza-Klugman, Raul and Efimov, Kirill},
month = jan,
year = {2022},
pages = {34},
file = {Moshe - 2021 - EXPLOITING URL PARSERS THE GOOD, BAD, AND INCONSI.pdf:/Users/tullsen/Zotero/storage/BFA7CJED/Moshe - 2021 - EXPLOITING URL PARSERS THE GOOD, BAD, AND INCONSI.pdf:application/pdf},
}
@misc{noammosheExploitingURLParsing2022,
title = {Exploiting {URL} {Parsing} {Confusion} {Vulnerabilities}},
url = {https://claroty.com/2022/01/10/blog-research-exploiting-url-parsing-confusion/},
abstract = {A joint Claroty Team82-Snyk research collaboration uncovered URL parsing confusion vulnerabilities in popular parsing libraries.},
language = {English},
urldate = {2022-01-15},
howpublished = {Claroty},
author = {Moshe, Noam and Brizinov, Sharon},
month = jan,
year = {2022},
}
@inproceedings{singhPDFClassificationUsing2022,
author = {Singh, Divyanshu and Bhatnagar, Mansi and Yadav, Vrinda},
editor = {Gunjan, Vinit Kumar and Zurada, Jacek M.},
title = {{PDF} {Classification} {Using} {Logistic} {Regression} and {Latent} {Dirichlet} {Allocation}},
booktitle = {Proceedings of the 2nd {International} {Conference} on {Recent} {Trends} in {Machine} {Learning}, {IoT}, {Smart} {Cities} and {Applications}},
series = {Lecture {Notes} in {Networks} and {Systems}},
publisher = {Springer},
address = {Singapore},
month = jan,
year = {2022},
pages = {399--407},
isbn = {9789811664076},
doi = {10.1007/978-981-16-6407-6_36},
url = {https://link.springer.com/chapter/10.1007/978-981-16-6407-6_36},
language = {English},
abstract = {Over the past few years, the classification of documents has been a challenging task. For document classification, no effective method or structured process has been developed so far to assign suitable labels to a large set of documents. Hence there is a need to develop an efficient and accurate method for the classification of PDF documents. The paper proposes a novel method which is capable of classifying PDF documents into most appropriate class and subclass. The proposed framework uses Machine learning with logistic regression classifier as a supervised approach along with Latent Dirichlet allocation (LDA). A combination of two different datasets has been used, the first dataset is ARXIV data of more than 31000 research paper meta data dated between 1992 and 2017 and the second one is created by the authors on their own. The method achieved a significant accuracy comparable with the existing approaches.},
keywords = {LDA, Machine learning, Natural language processing, PDF classification},
}
@incollection{vishnuPDFMalwareClassifiers2022,
author = {Vishnu, N. S. and Lakshmi, Sripada Manasa and Kavita and Verma, Sahil and Shukla, Awadhesh Kumar},
title = {{PDF} {Malware} {Classifiers} – {A} {Survey}, {Future} {Directions} and {Recommended} {Methodology}},
booktitle = {Information {Security} {Handbook}},
edition = {1},
publisher = {CRC Press},
address = {Boca Raton, USA},
month = feb,
year = {2022},
pages = {24},
isbn = {978-0-367-80822-8},
url = {https://www.taylorfrancis.com/chapters/edit/10.1201/9780367808228-7/pdf-malware-classifiers-survey-future-directions-recommended-methodology-vishnu-sripada-manasa-lakshmi-kavita-sahil-verma-awadhesh-kumar-shukla},
language = {English},
abstract = {Malicious software continues to pose a major threat to the cyber world. Text files are the most frequently used vectors to infect various systems using malware. In all this, to execute the attack, the intruder attempts to merge the malignant code with the benevolent text data. Due to its compatibility and lightweight characteristics, PDF (portable document format) is the most widely used file method of sharing documents. In today's world, attackers are using cutting-edge methods to obfuscate malware concealed inside document files. So, it is difficult for malware detection classifiers to effectively identify the text. To understand their design and working procedures, we surveyed different types of learning-based PDF malware classifiers. Also, we have described the pdf document by which we can understand the workings of malware. Finally, we recommended a methodology on the basis of the literature survey and specified the future direction for the better classification results. This work is the extension of dissertation.},
}
@article{alghamdiExtractingToCMetadata2022,
title = {Extracting {ToC} and {Metadata} from {PDF} {Books}: {A} {Rule}-{Based} {Approach}},
shorttitle = {Extracting {ToC} and {Metadata} from {PDF} {Books}},
doi = {10.24507/icicelb.13.02.133},
url = {https://doi.org/10.24507/icicelb.13.02.133},
abstract = {In recent years, e-books in PDF format have been relied upon as a huge repository of printed form of knowledge. More than 150 publishing houses in various academic, industrial, and other fields are interested in developing e-books for ease of use, printing, and sharing over the Internet. However, the methods of automatic extraction of e-book information in PDF format are not easy, and they need continuous improvement and development to enhance the efficient usage of e-books by means of searching over the web, indexing by search engines and archiving/retrieval by the digital libraries. Thus, this paper proposes a high-accuracy rule-based approach to extracting the metadata (book title, author’s name, and year of publication) and table of contents (section number, section title, and section page number) from PDF books. The proposed approach achieved 89.22\% accuracy of information extraction in contrast to classical schemes of literature.},
language = {English},
urldate = {2021-12-29},
journal = {ICIC Express Letters, Part B: Applications},
volume = {13},
number = {2},
issn = {2185-2766},
publisher = {ICIC International},
author = {Alghamdi, Huda and Dawwas, Waad and Almutairi, Taghreed H. and {Atta-ur-Rahman}},
month = feb,
year = {2022},
pages = {133--143},
file = {Alghamdi et al. - 2022 - Extracting ToC and Metadata from PDF Books A Rule.pdf:/Users/tullsen/Zotero/storage/WSZ2YF5B/Alghamdi et al. - 2022 - Extracting ToC and Metadata from PDF Books A Rule.pdf:application/pdf},
}
@inproceedings{reckerGPUAcceleratedPDF2011,
author = {Recker, John and Lin, I.-Jong and Tastl, Ingeborg},
title = {A {GPU} accelerated {PDF} transparency engine},
booktitle = {Parallel {Processing} for {Imaging} {Applications}},
volume = {7872},
publisher = {International Society for Optics and Photonics},
address = {San Francisco, USA},
month = jan,
year = {2011},
pages = {78720T},
doi = {10.1117/12.872568},
url = {https://www.spiedigitallibrary.org/conference-proceedings-of-spie/7872/78720T/A-GPU-accelerated-PDF-transparency-engine/10.1117/12.872568.short},
urldate = {2019-04-22},
language = {English},
abstract = {As commercial printing presses become faster, cheaper and more efficient, so too must the Raster Image Processors (RIP) that prepare data for them to print. Digital press RIPs, however, have been challenged to on the one hand meet the ever increasing print performance of the latest digital presses, and on the other hand process increasingly complex documents with transparent layers and embedded ICC profiles. This paper explores the challenges encountered when implementing a GPU accelerated driver for the open source Ghostscript Adobe PostScript and PDF language interpreter targeted at accelerating PDF transparency for high speed commercial presses. It further describes our solution, including an image memory manager for tiling input and output images and documents, a PDF compatible multiple image layer blending engine, and a GPU accelerated ICC v4 compatible color transformation engine. The result, we believe, is the foundation for a scalable, efficient, distributed RIP system that can meet current and future RIP requirements for a wide range of commercial digital presses.},
note = {HP Labs
https://www.researchgate.net/publication/241307379\_A\_GPU\_accelerated\_PDF\_transparency\_engine},
annote = {Summary
An explanation of how HP enhanced/modified GhostScript (OSS) to utilise GPUs to improve rendering performance. Note that this is from 2010 so well before GPUs were main-stream - and before Adobe (for example) added GPU support to Acrobat and Reader.
Provided as an explanation of how non-traditional software techniques are being applied to PDF parsing...},
file = {Recker et al. - 2011 - A GPU accelerated PDF transparency engine.pdf:/Users/tullsen/Zotero/storage/6SE8FPRF/Recker et al. - 2011 - A GPU accelerated PDF transparency engine.pdf:application/pdf},
}
@article{anuragaImplementationEvaluationPDF2021,
title = {An {Implementation} and {Evaluation} of {PDF} {Password} {Cracking} {Using} {John} the {Ripper} {And} {Crunch}},
volume = {3},
copyright = {Creative Commons Attribution 4.0 International, Open Access},
isbn = {978-93-5426-386-6},
url = {https://zenodo.org/record/5112693},
doi = {10.5281/ZENODO.5112693},
abstract = {It can be challenging to choose the most effective wordmangling rules to apply while undertaking a dictionary-based password cracking attempt. We discuss a new method for generating password structures in the highest possibility order in this work. Based on a training set of previously revealed passwords, we first build an artificial probabilistic context-free grammar. As a result of this grammar, we can generate word-mangling rules and, as a result, password guesses for password cracking. By putting our tools and strategies to the test on genuine password sets, we will show that this strategy appears to be a more effective way to crack passwords than traditional methods. Our approach cracked 28 percent to 129 percent more passwords than John the Ripper, a publicly available standard password cracking software, in one set of testing. We'll construct a wordlist for dictionary attack using the Crunch tool.},
language = {English},
number = {1},
urldate = {2021-12-28},
journal = {Proceedings of the National Conference on Emerging Computer Applications (NCECA)-2021},
author = {{Anurag A} and {Mercy Joseph}},
month = jul,
year = {2021},
note = {Publisher: Zenodo
https://nceca.in/2021/4An\_Implementation\_and\_Evaluation\_of\_PDF\_Password\_Cracking\_Using\_John\_the\_Ripper\_and\_Crunch.pdf},
keywords = {Bruteforce, Crunch, Dictionary Attack, John the Ripper, Kali Linux, PDF, Vulnerabilities},
pages = {18--22},
file = {Anurag A and Mercy Joseph - 2021 - An Implementation and Evaluation of PDF Password C.pdf:/Users/tullsen/Zotero/storage/9FVM8D4W/Anurag A and Mercy Joseph - 2021 - An Implementation and Evaluation of PDF Password C.pdf:application/pdf},
}
@inproceedings{sharifSecuringIntegrityPDF2021,
author = {Sharif, Amer and Ginting, Dewi S. and Dias, Arya D.},
title = {Securing the {Integrity} of {PDF} {Files} using {RSA} {Digital} {Signature} and {SHA}-3 {Hash} {Function}},
booktitle = {2021 {International} {Conference} on {Data} {Science}, {Artificial} {Intelligence}, and {Business} {Analytics} ({DATABIA})},
publisher = {IEEE},
address = {Medan, Indonesia},
month = nov,
year = {2021},
pages = {154--159},
isbn = {978-1-66542-680-0},
doi = {10.1109/DATABIA53375.2021.9650121},
url = {https://ieeexplore.ieee.org/abstract/document/9650121},
copyright = {Copyright IEEE},
language = {English},
abstract = {Signatures are used on documents as written proof that the document was verified by the person indicated. Signature also indicated that the document originated from the signer if the document is transferred to another party. A document maybe in physical print form but may also be a digital print. A digital print requires additional security since a digital document may easily be altered by anyone although the said document is signed using a photographed or scanned signature. One of the means of security is by using the RSA Digital Signature method which is a combination of the RSA algorithm with Digital Signature. RSA algorithm is one of the public key cryptography algorithms, while Digital Signature is a security scheme which may guarantee the authenticity, non-repudiation, and integrity of a file by means of a hash function. This research implemented a web-based combination of RSA Digital Signature with SHA-3 hash function to secure the integrity of PDF files using PHP programming language. The result is a web-based system which could guarantee the authenticity, non repudiation and integrity of PDF files. Testing were carried out on six different sizes of PDF files ranging from 6 KB, up to 23285 KB on three different web browsers: Google Chrome, Microsoft Edge, and Mozilla Firefox. Average processing times of signing and verifying on each browsers were 1.3309 seconds, 1.2565 seconds, and 1.2667 seconds.},
keywords = {authentication, Data science, Databases, digital signature, Distance measurement, file integrity, Hash functions, Keccak, non repudiation, Portable document format, Public key cryptography, Receivers, RSA algorithm, SHA-3, signing, verification},
}
@inproceedings{adhataraoHowArePDF2021,
address = {Montpellier, France},
title = {How are {PDF} files published in the {Scientific} {Community}?},
isbn = {978-1-66541-717-4},
url = {https://ieeexplore.ieee.org/abstract/document/9648374},
doi = {10.1109/WIFS53200.2021.9648374},
abstract = {Authors are often not aware of hidden information and that they can contain more information than the actual content of the file. This work mainly focuses on how PDF files are published in the scientific community. We have analyzed a corpus of 555865 PDF files to show that direct and modified authoring process of PDF creations leads to the leakage of sensitive information on the researchers. Our analysis on the extraction of the metadata has shown that at least 23\% of the PDF files in our dataset contains valuable information on the authoring process. We were even able to solve the co-authorship (multiple authors) problem by crossing the information of multiple PDF files using linear algebra. We believe that, PDF sanitization needs to be included in the scientific publication processes to avoid leakage of sensitive information. We have explored and suggested necessary strategies available for the safer distribution of scientific work by researchers.},
language = {English},
booktitle = {2021 {IEEE} {International} {Workshop} on {Information} {Forensics} and {Security} ({WIFS})},
publisher = {IEEE},
author = {Adhatarao, Supriya and Lauradoux, Cédric},
month = dec,
year = {2021},
note = {ISSN: 2157-4774},
keywords = {Conferences, Cryptography, Data mining, Forensics, Linear algebra, Metadata, PDF files, sanitization},
pages = {1--6},
}
@incollection{strantzAltTextCreatingAccessible2021,
address = {New York, NY, USA},
series = {Proceedings of {SIGDOC}'21},
title = {Beyond ``{Alt}-{Text}'': {Creating} {Accessible} {Data} {Visualizations} with {Code}},
isbn = {978-1-4503-8628-9},
shorttitle = {Beyond ``{Alt}-{Text}''},
url = {https://doi.org/10.1145/3472714.3473661},
doi = {10.1145/3472714.3473661},
abstract = {Data visualization is a reliable tool for professional communication practitioners for synthesizing and presenting data to a variety of audiences. However, data visualizations have a range of accessibility concerns including: visual acuity, color/contrast difficulties, color blindness and size/scale issues. Alt-text is not enough to make these visuals accessible and therefore more advanced web coding techniques, such as the Scalable Vector Graphic (SVG) format should be used to create data visualizations for the web. The use of SVG allows for greater coded semantic and contextual information to be added to data visualizations resulting in graphics that can be better interacted with by users with a variety of accessibility software.},
language = {English},
number = {39},
urldate = {2021-12-20},
booktitle = {The 39th {ACM} {International} {Conference} on {Design} of {Communication}},
publisher = {Association for Computing Machinery},
author = {Strantz, Adam},
month = oct,
year = {2021},
keywords = {Accessibility, Code, Data Visualization, Design, Scalable Vector Graphics},
pages = {331--337},
file = {Strantz - 2021 - Beyond Alt-Text Creating Accessible Data Visual.pdf:/Users/tullsen/Zotero/storage/U6PABFVZ/Strantz - 2021 - Beyond Alt-Text Creating Accessible Data Visual.pdf:application/pdf},
}
@phdthesis{sebekEvaluationUNetMultilabel2021,
author = {Sebek, Fredrik},
title = {An evaluation of {U}-{Net}’s multi-label segmentation performance on {PDF} documents in a medical context},
type = {Master of {Science} - {Machine} {Learning}},
school = {KTH ROYAL INSTITUTE OF TECHNOLOGY, SCHOOL OF ELECTRICAL ENGINEERING AND COMPUTER SCIENCE},
address = {Stockholm, Sweden},
month = aug,
year = {2021},
url = {http://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-306046},
urldate = {2021-12-19},
language = {English},
abstract = {The Portable Document Format (PDF) is an ideal format for viewing and printing documents. Today many companies store their documents in a PDF format. However, the conversion from a PDF document to any other structured format is inherently difficult. As a result, a lot of the information contained in a PDF document is not directly accessible - this is problematic. Manual intervention is required to accurately convert a PDF into another file format - this can be deemed as both strenuous and exhaustive work. An automated solution to this process could greatly improve the accessibility to information in many companies. A significant amount of literature has investigated the process of extracting information from PDF documents in a structured way. In recent years these methodologies have become heavily dependent on computer vision. The work on this paper evaluates how the U-Net model handles multi-label segmentation on PDF documents in a medical context - extending on Stahl et al.’s work in 2018. Furthermore, it compares two newer extensions of the U-Net model, MultiResUNet (2019) and SS-U-Net (2021). Additionally, it assesses how each of the models performs in a data-sparse environment. The three models were implemented, trained, and then evaluated. Their performance was measured using the Dice coefficient, Jaccard coefficient, and percentage similarity. Furthermore, visual inspection was also used to analyze how the models performed from a perceptual standpoint. The results indicate that both the U-Net and the SS-U-Net are exceptional at segmenting PDF documents effectively in a data abundant environment. However, the SS-U-Net outperformed both the U-Net and the MultiResUNet in the data-sparse environment. Furthermore, the MultiResUNet significantly underperformed in comparison to both the U-Net and SS-U-Net models in both environments. The impressive results achieved by the U-Net and SS-U-Net models suggest that it can be combined with a larger system. 
This proposed system allows for accurate and structured extraction of information from PDF documents.},
note = {https://www.diva-portal.org/smash/record.jsf?pid=diva2\%3A1619512\&dswid=942},
file = {Sebek - 2021 - An evaluation of U-Net’s multi-label segmentation .pdf:/Users/tullsen/Zotero/storage/E6REEGYG/Sebek - 2021 - An evaluation of U-Net’s multi-label segmentation .pdf:application/pdf},
}
@inproceedings{indartaDevelopmentEModuleCourses2021,
author = {Indarta, Yose and Dewi, Ika Parma and Ambiyar and Syahril and Fadhilah and Asnur, Lise and Ranuharja, Fadhli and Samala, Agariadne Dwinggo},
title = {Development of {E}-{Module} {Courses} {Tata} {Boga} 2 {Based} on {Flip} {PDF} {Professional} for {Teaching} {Learning} {Process} in {The} {Pandemic} of {Covid} 19},
publisher = {Atlantis Press},
month = dec,
year = {2021},
pages = {174--179},
isbn = {978-94-6239-479-7},
doi = {10.2991/assehr.k.211208.029},
url = {https://www.atlantis-press.com/proceedings/ictvet-21/125965548},
urldate = {2021-12-17},
language = {en},
abstract = {This study aims to determine the results of the validity and practicality of e-modules made using Flip PDF Professional. The research model used was developed by Borg and Gall which consists of 10 steps but only 7 steps were adapted due to time and funding constraints. The validity test was carried out by 2 experts, namely Material and Media Experts....},
note = {ISSN: 2352-5398},
file = {Indarta et al. - 2021 - Development of E-Module Courses Tata Boga 2 Based .pdf:/Users/tullsen/Zotero/storage/S7C4L9RE/Indarta et al. - 2021 - Development of E-Module Courses Tata Boga 2 Based .pdf:application/pdf},
}
@misc{isotc171sc2wg8ISODTS69122021,
title = {{ISO}/{DTS} 6912 {Document} management – {Portable} {Document} {Format} – {Clarification} for initial graphics state in {ISO} 32000-2 ({PDF} 2.0)},
copyright = {Copyright ISO},
abstract = {This document clarifies the parameter values for the initial graphics state of different types of content streams that are defined in ISO 32000-2, Document management — Portable document format — Part 2: PDF 2.0. This ensures a consistent rendered appearance under all conditions.},
language = {English},
publisher = {ISO},
author = {{ISO TC 171 SC 2 WG 8}},
month = jun,
year = {2021},
}
@inproceedings{kaushikOffensiveApproachHiding2021,
address = {Singapore},
series = {Lecture {Notes} on {Data} {Engineering} and {Communications} {Technologies}},
title = {An {Offensive} {Approach} for {Hiding} {Malicious} {Payloads} in an {Image}},
isbn = {9789811639616},
url = {https://link.springer.com/chapter/10.1007/978-981-16-3961-6_23},
doi = {10.1007/978-981-16-3961-6_23},
abstract = {Steganography is the oldest technique that is been used from century, steganography purpose has not changed, i.e., all these techniques aim at hiding data or protecting data. With the help of steganalysis, the media can be analyzed to check for the presence of any secret information. Nowadays, attackers are making the use of advanced steganography approaches to conceal the secret information and communicate in a stealth manner. In this paper, the authors have discussed about the novel approach to hide malicious payload into image metadata. Therefore, metadata is a data that describes about the image rights and its administration. Hacker generally uses this metadata to perform various malicious attacks such embedding malicious script inside the image metadata and many more.},
language = {English},
booktitle = {Cyber {Security} and {Digital} {Forensics}},
publisher = {Springer},
author = {Kaushik, Keshav and Surana, Sneha},
editor = {Khanna, Kavita and Estrela, Vania Vieira and Rodrigues, Joel José Puga Coelho},
month = oct,
year = {2021},
keywords = {Metadata, Cyber forensics, Cybersecurity, Digital forensics, EXIF, Image steganography, Payload, Stager, Steganalysis, Steganography},
pages = {265--272},
}
@article{beckwithNeedleHaystackDetecting2021,
title = {Needle in a {Haystack}: {Detecting} {Subtle} {Malicious} {Edits} to {Additive} {Manufacturing} {G}-code {Files}},
issn = {1943-0671},
shorttitle = {Needle in a {Haystack}},
url = {https://ieeexplore.ieee.org/abstract/document/9619477},
doi = {10.1109/LES.2021.3129108},
abstract = {Increasing usage of Digital Manufacturing (DM) in safety-critical domains is increasing attention on the cybersecurity of the manufacturing process, as malicious third parties might aim to introduce defects in digital designs. In general, the DM process involves creating a digital object (as CAD files) before using a slicer program to convert the models into printing instructions (e.g. g-code) suitable for the target printer. As the g-code is an intermediate machine format, malicious edits may be difficult to detect, especially when the golden (original) models are not available to the manufacturer. In this work we aim to quantify this hypothesis through a red-team/blue-team case study, whereby the red-team aims to introduce subtle defects that would impact the properties (strengths) of the 3D printed parts, and the blue-team aims to detect these modifications in the absence of the golden models. The case study had two sets of models, the first with 180 designs (with 2 compromised using 2 methods) and the second with 4320 designs (with 60 compromised using 6 methods). Using statistical modelling and machine learning (ML), the blue-team was able to detect all the compromises in the first set of data, and 50 of the compromises in the second.},
journal = {IEEE Embedded Systems Letters},
author = {Beckwith, Caleb and Naicker, Harsh Sankar and Mehta, Svara and Udupa, Viba R. and Nim, Nghia Tri and Gadre, Varun and Pearce, Hammond and Mac, Gary and Gupta, Nikhil},
month = nov,
year = {2021},
note = {Conference Name: IEEE Embedded Systems Letters},
keywords = {Machine learning, Clustering algorithms, Manufacturing, Principal component analysis, Printers, Solid modeling, Three-dimensional displays},
pages = {1--1},
}
@inproceedings{hussainReviewMaliciousAltering2021,
address = {Zallaq, Bahrain},
title = {A {Review} of {Malicious} {Altering} {Healthcare} {Imagery} using {Artificial} {Intelligence}},
isbn = {978-1-66544-032-5},
url = {https://ieeexplore.ieee.org/document/9582068/},
doi = {10.1109/3ICT53449.2021.9582068},
abstract = {During the second half of 2020, healthcare is and has been the number one target for cybercrime, enormous amount of cyberattacks on hospitals and health systems increased, and specialists trust there are more to come. Attackers who can get the way to reach the electronic health record would exploit it and will use it for their own interest like deal or vend it on the underground economy, hostage the systems and the sensitive data, that has a significant impact on operations. This review tried to analyze how cyber attacker employ Generative Adversarial Networks (GANs) to alter the evidences of patient’s medical conditions from image scans and reports. Cyber attacker has different purposes in order to obstruct a political applicant, lockup investigations, obligate insurance scam, execute an act of violence, or even commit homicide. Numerous correlated works constructed on gan in medical images practices had been reviews in the period between 2000 to 2021. Many papers showed how hospital system, physicians and radiology’s specialists and the most recent researches showed an extremely exposed to different types of intrusion gan attacks.},
language = {en},
urldate = {2021-11-20},
booktitle = {2021 {International} {Conference} on {Innovation} and {Intelligence} for {Informatics}, {Computing}, and {Technologies} ({3ICT})},
publisher = {IEEE},
author = {Hussain, Fadheela and Ksantini, Riadh and Hammad, Mustafa},
month = sep,
year = {2021},
pages = {646--651},
file = {Hussain et al. - 2021 - A Review of Malicious Altering Healthcare Imagery .pdf:/Users/tullsen/Zotero/storage/XQWBQQKF/Hussain et al. - 2021 - A Review of Malicious Altering Healthcare Imagery .pdf:application/pdf},
}
@techreport{manharmohammedHAPSSAHolisticApproach2021,
address = {San Diego, CA, USA},
title = {{HAPSSA}: {Holistic} {Approach} to {PDF} {Malware} {Detection} {Using} {Signal} and {Statistical} {Analysis}},
shorttitle = {{HAPSSA}},
url = {https://ui.adsabs.harvard.edu/abs/2021arXiv211104703M},
abstract = {Malicious PDF documents present a serious threat to various security organizations that require modern threat intelligence platforms to effectively analyze and characterize the identity and behavior of PDF malware. State-of-the-art approaches use machine learning (ML) to learn features that characterize PDF malware. However, ML models are often susceptible to evasion attacks, in which an adversary obfuscates the malware code to avoid being detected by an Antivirus. In this paper, we derive a simple yet effective holistic approach to PDF malware detection that leverages signal and statistical analysis of malware binaries. This includes combining orthogonal feature space models from various static and dynamic malware detection methods to enable generalized robustness when faced with code obfuscations. Using a dataset of nearly 30,000 PDF files containing both malware and benign samples, we show that our holistic approach maintains a high detection rate (99.92\%) of PDF malware and even detects new malicious files created by simple methods that remove the obfuscation conducted by malware authors to hide their malware, which are undetected by most antiviruses.},
language = {English},
urldate = {2021-11-13},
institution = {IEEE},
author = {Manhar Mohammed, Tajuddin and Nataraj, Lakshmanan and Chikkagoudar, Satish and Chandrasekaran, Shivkumar and Manjunath, B. S.},
month = nov,
year = {2021},
note = {Publication Title: arXiv e-prints
ADS Bibcode: 2021arXiv211104703M
https://ieeexplore.ieee.org/document/9653097},
keywords = {Computer Science - Cryptography and Security, Computer Science - Machine Learning, Electrical Engineering and Systems Science - Signal Processing},
file = {Manhar Mohammed et al. - 2021 - HAPSSA Holistic Approach to PDF Malware Detection.pdf:/Users/tullsen/Zotero/storage/E7GHR3FI/Manhar Mohammed et al. - 2021 - HAPSSA Holistic Approach to PDF Malware Detection.pdf:application/pdf},
}
@article{kuribayashiStealthPDFDataHiding2021,
title = {{StealthPDF}: {Data} hiding method for {PDF} file with no visual degradation},
volume = {61},
issn = {2214-2126},
shorttitle = {{StealthPDF}},
url = {https://www.sciencedirect.com/science/article/pii/S2214212621001034},
doi = {10.1016/j.jisa.2021.102875},
abstract = {Conventional data hiding methods for PDF file insert a payload data by slightly modifying the position of characters in a document. Even if the changes are small, a certain degree of visual distortion is inevitably introduced to the PDF file. In this work, we propose a new data hiding method that splits the space value between characters. Specifically, a space value is split into two or more related values. Except for the first value which is reserved to store the corrective data, each of the related values encodes a segment of the payload data. When the PDF file is opened by a PDF viewer, the visual appearance is exactly the same as its original counterpart, i.e., complete quality preservation. To prevent direct observation of PDF file, access control is introduced by setting an owner password, which is a built-in function in the PDF standard. In the best case scenario, 38,160 bits can be hidden, while the observed file size increase is 12,776 Bytes.},
language = {English},
urldate = {2021-06-15},
journal = {Journal of Information Security and Applications},
author = {Kuribayashi, Minoru and Wong, KokSheik},
month = sep,
year = {2021},
keywords = {PDF, Authentication, Complete quality preservation, Data hiding, Space, StealthPDF},
pages = {102875},
}
@inproceedings{nicholasDocumentEngineeringIssues2021,
address = {New York, NY, USA},
series = {{DocEng} '21},
title = {Document engineering issues in malware analysis},
isbn = {978-1-4503-8596-1},
url = {https://doi.org/10.1145/3469096.3470950},
doi = {10.1145/3469096.3470950},
abstract = {We present an overview of the field of malware analysis with emphasis on issues related to document engineering. We will introduce the field with a discussion of the types of malware, including executable binaries, malicious PDFs, polymorphic malware, ransomware, and exploit kits. We will conclude with our view of important research questions in the field. This is an updated version of tutorials presented in previous years, with more information about newly-available tools.},
urldate = {2021-08-17},
booktitle = {Proceedings of the 21st {ACM} {Symposium} on {Document} {Engineering}},
publisher = {Association for Computing Machinery},
author = {Nicholas, Charles and Joyce, Robert J. and Simske, Steve},
month = aug,
year = {2021},
keywords = {disassembler, malware analysis, virtual machine},
pages = {1},
file = {Nicholas et al. - 2021 - Document engineering issues in malware analysis.pdf:/Users/tullsen/Zotero/storage/XMU6LNJP/Nicholas et al. - 2021 - Document engineering issues in malware analysis.pdf:application/pdf},
}
@phdthesis{diegoleonExtractingInformationPDF2021,
address = {Stockholm, Sweden},
type = {{DEGREE} {PROJECT} {COMPUTER} {SCIENCE} {AND} {ENGINEERING}, {SECOND} {CYCLE}, 30 {CREDITS}},
title = {Extracting {Information} {From} {PDF} {Invoices} {Using} {Deep} {Learning}},
url = {https://www.diva-portal.org/smash/get/diva2:1608779/FULLTEXT01.pdf},
abstract = {Manually extracting information from invoices can be time-consuming, especially
when managing large amounts of documents. Finding a way to automatically
extract this information could help businesses save resources. This thesis
investigates the information extraction of semi-structured data from PDF
invoices using deep learning methods and comparing them to a rule-based
model built as a baseline for comparison. More specifically, an object
detection approach based on the Faster R-CNN model is compared with a
Natural Language Processing (NLP) approach based on BERT. These models
were trained to extract 4 different fields, with a dataset consisting of 899 PDF
invoices. These models were tested on how well they extracted each field, and
their results were then compared. The NLP approach achieved the highest
overall F1 score of 0.911 and attained the highest score in all fields except
one. In second place came the rule-based approach, with an overall F1 score
of 0.830. In last place came the object detection approach with an overall
F1 score of 0.815. It is concluded that the NLP approach is best suited for
the task of information extraction from PDF invoices. Because of the small
dataset and Faster R-CNN requiring large amounts of data and long training,
the object detection approach did not reach its full potential. However, further
research is needed to prove if it could outperform the NLP approach with those
improvements.},
language = {English, Swedish},
school = {KTH ROYAL INSTITUTE OF TECHNOLOGY, SCHOOL OF ELECTRICAL ENGINEERING AND COMPUTER SCIENCE},
author = {{Diego Leon}},
month = aug,
year = {2021},
file = {Diego Leon - 2021 - Extracting Information From PDF Invoices Using Dee.pdf:/Users/tullsen/Zotero/storage/IPXBGJVV/Diego Leon - 2021 - Extracting Information From PDF Invoices Using Dee.pdf:application/pdf},
}
@article{giguetDanielFinTOC2021Taking2021,
title = {Daniel@{FinTOC}-2021: {Taking} {Advantage} of {Images} and {Vectorial} {Shapes} in {Native} {PDF} {Document} {Analysis}},
url = {https://aclanthology.org/2021.fnp-1.13.pdf},
abstract = {In this paper, we present our contribution to the FinTOC-2021 Shared Task “Financial Document Structure Extraction”. We participated in the tracks dedicated to English and French document processing. We get results for Title detection and TOC generation performance which demonstrates a good precision. We address the problem in a fairly unusual but ambitious way which consists in considering simultaneously text content, vectorial shapes and images embedded in the native PDF document, and to structure the document in its entirety.},
language = {English, French},
number = {The Third Financial Narrative Processing Workshop (FNP 2021)},
journal = {FinTOC-2021 Shared Task “Financial Document Structure Extraction”},
author = {Giguet, Emmanuel and Lejeune, Gaël},
month = sep,
year = {2021},
note = {http://wp.lancs.ac.uk/cfie/},
pages = {5},
file = {Giguet and Lejeune - Daniel@FinTOC-2021 Taking Advantage of Images and.pdf:/Users/tullsen/Zotero/storage/Y5WRI9NF/Giguet and Lejeune - Daniel@FinTOC-2021 Taking Advantage of Images and.pdf:application/pdf},
}
@inproceedings{guedesSupervisedLearningApproach2021,
address = {Cham},
series = {Lecture {Notes} in {Computer} {Science}},
title = {Supervised {Learning} {Approach} for {Section} {Title} {Detection} in {PDF} {Scientific} {Articles}},
volume = {13067},
isbn = {978-3-030-89817-5},
url = {https://link.springer.com/chapter/10.1007/978-3-030-89817-5_3},
doi = {10.1007/978-3-030-89817-5_3},
abstract = {The majority of scientific articles is available in Portable Document Format (PDF). Although PDF format has the advantage of preserving layout across platforms it does not maintain the original metadata structure, making it difficult further text processing. Despite different layouts, depending on the applied template, articles have a hierarchical structure and are divided into sections, which represent topics of specific subjects, such as methodology and results. Hence, section segmentation serves as an important step for a contextualized text processing of scientific articles. Therefore, this work applies binary classification, a supervised learning task, for section title detection in PDF scientific articles. To train the classifiers, a large dataset (more than 5 millions samples from 7,302 articles) was created through an automated feature extraction approach, comprised by 17 features, where 4 were introduced in this work. Training and testing were made for ten different classifiers for which the best F1 score reached 0.94. Finally, we evaluated our results against CERMINE, an open-source system that extracts metadata from scientific articles, having an absolute improvement in section detection of 0.19 in F1 score.},
language = {en},
booktitle = {Advances in {Computational} {Intelligence}},
publisher = {Springer International Publishing},
author = {Guedes, Gustavo Bartz and da Silva, Ana Estela Antunes},
editor = {Batyrshin, Ildar and Gelbukh, Alexander and Sidorov, Grigori},
month = oct,
year = {2021},
keywords = {Scientific article segmentation, Section title detection, Supervised learning, Text segmentation},
pages = {44--54},
}
@article{mikhailovPyTabbyDocreaderModule2021,
title = {{PyTabby}: a {Docreader}’s module for extracting text and tables from {PDF} with a text layer},
copyright = {Creative Commons License Attribution 4.0 International (CC BY 4.0)},
url = {http://ceur-ws.org/Vol-2984/paper15.pdf},
abstract = {This paper presents a complete solution for extraction of textual information and tables from PDF with a text layer. The presented solution consist of two parts: PyTabby is a tool for extracting text and tables from PDF with a complex background and layout, and Python wrapper module for Docreader tool. The PyTabby tool extracts text and tables from the low level representation of the PDF format. It enables employment of the additional information excluded in scanned documents and provides improvement of quality and performance compared with Optical Character Recognition (OCR) methods. The presented solution is incorporated into Docreader tool to parse PDF files with a text layer and is used as a part of the TALISMAN technology for social analytics.},
language = {English},
journal = {Information Technologies: Algorithms, Models, Systems (ITAMS)},
author = {Mikhailov, Andrey A and Shigarov, Alexey and Kozlov, Ilya S},
month = sep,
year = {2021},
pages = {7},
file = {Mikhailov et al. - PyTabby a Docreader’s module for extracting text .pdf:/Users/tullsen/Zotero/storage/Z8TGW93U/Mikhailov et al. - PyTabby a Docreader’s module for extracting text .pdf:application/pdf},
}
@misc{iccDocumentICC1A1999,
title = {Document {ICC}.{1A}:1999-04 {Addendum} 2 to {Specification} {ICC}.1:1998-09},
copyright = {Copyright International Color Consortium},
url = {http://color.org/icc_specs2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = apr,
year = {1999},
file = {1999 - Document ICC.1A1999-04 Addendum 2 to Specificatio.PDF:/Users/tullsen/Zotero/storage/WG64HJ9V/1999 - Document ICC.1A1999-04 Addendum 2 to Specificatio.PDF:application/pdf},
}
@misc{iccSpecificationICC1998091998,
title = {Specification {ICC}.1:1998-09 {File} {Format} for {Color} {Profiles}},
copyright = {Copyright International Color Consortium},
url = {http://color.org/icc_specs2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = sep,
year = {1998},
file = {1998 - Specification ICC.11998-09 File Format for Color .PDF:/Users/tullsen/Zotero/storage/PPB98SK6/1998 - Specification ICC.11998-09 File Format for Color .PDF:application/pdf},
}
@misc{iccSpecificationICC1997081997,
title = {Specification {ICC}.1:1997-08 ({Version} 3.4) {File} {Format} for {Color} {Profiles}},
copyright = {Copyright International Color Consortium},
url = {http://color.org/icc_specs2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = aug,
year = {1997},
file = {1997 - Specification ICC.11997-08 (Version 3.4) File For.pdf:/Users/tullsen/Zotero/storage/7GLPKB77/1997 - Specification ICC.11997-08 (Version 3.4) File For.pdf:application/pdf},
}
@misc{iccSpecificationICC1996111996,
title = {Specification {ICC}.1:1996-11 ({Version} 3.3) {File} {Format} for {Color} {Profiles}},
copyright = {Copyright International Color Consortium},
url = {http://color.org/icc_specs2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = nov,
year = {1996},
file = {1996 - Specification ICC.11996-11 (Version 3.3) File For.pdf:/Users/tullsen/Zotero/storage/6ASTM5LZ/1996 - Specification ICC.11996-11 (Version 3.3) File For.pdf:application/pdf},
}
@misc{iccSpecificationICC1995111995,
title = {Specification {ICC}.1:1995-11 ({Version} 3.2) {File} {Format} for {Color} {Profiles}},
copyright = {Copyright International Color Consortium},
url = {http://color.org/icc_specs2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = nov,
year = {1995},
file = {1995 - Specification ICC.11995-11 (Version 3.2) File For.pdf:/Users/tullsen/Zotero/storage/T47QG6MW/1995 - Specification ICC.11995-11 (Version 3.2) File For.pdf:application/pdf},
}
@misc{iccSpecificationICC1995051995,
title = {Specification {ICC}.1:1995-05 ({Version} 3.01) {File} {Format} for {Color} {Profiles}},
copyright = {Copyright International Color Consortium},
url = {http://color.org/icc_specs2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = may,
year = {1995},
file = {1995 - Specification ICC.11995-05 (Version 3.01) File Fo.pdf:/Users/tullsen/Zotero/storage/8KHNYDB2/1995 - Specification ICC.11995-05 (Version 3.01) File Fo.pdf:application/pdf},
}
@misc{iccSpecificationICC1994061994,
title = {Specification {ICC}.1:1994-06 ({Version} 3.0) {File} {Format} for {Color} {Profiles}},
copyright = {Copyright International Color Consortium},
url = {http://color.org/icc_specs2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = jun,
year = {1994},
file = {1994 - Specification ICC.11994-06 (Version 3.0) File For.pdf:/Users/tullsen/Zotero/storage/N7XE6RMV/1994 - Specification ICC.11994-06 (Version 3.0) File For.pdf:application/pdf},
}
@misc{iccICC2004102004,
title = {{ICC}.1:2004-10},
copyright = {Copyright International Color Consortium},
shorttitle = {{ICC}.1 v4.2.0},
url = {http://color.org/icc_specs2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = oct,
year = {2004},
file = {2004 - ICC.12004-10.pdf:/Users/tullsen/Zotero/storage/ATX3TA3M/2004 - ICC.12004-10.pdf:application/pdf},
}
@misc{iccPrivateICCTag2019,
title = {Private and {ICC} {Tag} and {CMM} {Registry}},
copyright = {Copyright International Color Consortium},
shorttitle = {Tag {Registry}},
url = {http://www.color.org/signatures2.xalter},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = oct,
year = {2019},
file = {2019 - Private and ICC Tag and CMM Registry.pdf:/Users/tullsen/Zotero/storage/F6TRE5SD/2019 - Private and ICC Tag and CMM Registry.pdf:application/pdf},
}
@misc{iccSpecificationICC20192021,
title = {Specification {ICC}.2:2019 ({Profile} version 5.0.0.0) {Cumulative} {Errata} {List}},
url = {https://color.org/iccmax/ICC.2-2019_Cumulative_Errata_List_2021-09-09.pdf},
abstract = {Cumulative Errata List for iccMAX / ICC.2 / v5},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = sep,
year = {2021},
file = {ICC - 2021 - Specification ICC.22019 (Profile version 5.0.0.0).pdf:/Users/tullsen/Zotero/storage/BMGNE69H/ICC - 2021 - Specification ICC.22019 (Profile version 5.0.0.0).pdf:application/pdf},
}
@misc{iccWhitePaper472018,
title = {White {Paper} 47: {The} value of {iccMAX}},
copyright = {Copyright International Color Consortium},
shorttitle = {White {Paper} 47},
abstract = {iccMAX is a color management interchange format that addresses use cases beyond those addressed by the ICC v4 (ISO 15076-1) color management profile format. ICC v4 is widely used today in graphic arts workflows. For most of these workflows, v4 is straightforward to use and uniformly implemented across a large number of different software applications from different vendors. ICC v4 has enabled users to get the same or very similar results when color managing files through multiple different workflows, especially for the graphic arts.
In other applications, however such as managing digital photographs or color managing packaging in store lighting conditions, v4 is missing some key features. iccMAX evolved from work within the ICC to extend the v4 profile format beyond the graphic arts. iccMAX workflows are intended to be backward-compatible with v4, which means that iccMAX-aware applications also have to be able to use v4 profiles.
This paper is addressed to end users of color management systems, and is intended to be used to decide when an iccMAX rather than ICC v4 is the appropriate choice.},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
month = may,
year = {2018},
file = {2018 - White Paper 47 The value of iccMAX.pdf:/Users/tullsen/Zotero/storage/JGC2SYBN/2018 - White Paper 47 The value of iccMAX.pdf:application/pdf},
}
@misc{iccSpecificationICC20192019,
title = {Specification {ICC}.2:2019 ({Profile} version 5.0.0 - {iccMAX}) {Image} technology color management - {Extensions} to architecture, profile format and data structure [{REVISION} of {ICC}.2:2018]},
copyright = {Copyright International Color Consortium},
shorttitle = {{ICC}.2 v5.0.0},
language = {English},
publisher = {International Color Consortium},
author = {{ICC}},
year = {2019},
file = {2019 - Specification ICC.22019 (Profile version 5.0.0 - .pdf:/Users/tullsen/Zotero/storage/CXIDVQ7P/2019 - Specification ICC.22019 (Profile version 5.0.0 - .pdf:application/pdf},
}
@misc{ferdZUGFeRD2019,
title = {{ZUGFeRD} 2.0.1},
shorttitle = {{ZUGFeRD} 2.0.1},
url = {https://www.ferd-net.de/zugferd/zugferd-2.0/index.html},
abstract = {The German Forum on electronic Invoicing (FeRD) has developed a new release of uniform data format called ZUGFeRD 2.0 on 11th March 2019. It can be used for exchanging invoice data between enterprises, authorities and consumers. The format allows for an exchange of structural invoice data between the issuer and the recipient in a single PDF file without any necessary further steps of reading or processing the data.
ZUGFeRD 2.0 was developed in close coordination with the French standard Factur-X 1.0, is technically identical to it and thus also pursues the standardization objectives at the European level. The hybrid invoice format contains the structured invoice data in a PDF / A-3 file that forms the view component of the invoice. The structured invoice data can be read out and processed by the invoice recipient.
ZUGFeRD 2.0 meets the requirements of the EU Directive and the EU standard.},
language = {German, English},
publisher = {Forum for Electronic Invoicing Germany (FeRD)},
author = {{FeRD}},
month = mar,
year = {2019},
note = {Can be downloaded free of charge from above URL.},
keywords = {Invoicing, PDF/A},
}
@article{suriLostMigrationDocument2018,
title = {Lost in migration: document quality for batch conversion to {PDF}/{A}},
volume = {39},
issn = {0737-8831},
shorttitle = {Lost in migration},
url = {https://doi.org/10.1108/LHT-10-2017-0220},
doi = {10.1108/LHT-10-2017-0220},
abstract = {Purpose Changes in file format specifications challenge long-term preservation of digital documents. Digital archives thus often focus on specific file formats that are well suited for long-term preservation, such as the PDF/A format. Since only few customers submit PDF/A files, digital archives may consider converting submitted files to the PDF/A format. The paper aims to discuss these issues. Design/methodology/approach The authors evaluated three software tools for batch conversion of common file formats to PDF/A-1b: LuraTech PDF Compressor, Adobe Acrobat XI Pro and 3-HeightsTM Document Converter by PDF Tools. The test set consisted of 80 files, with 10 files each of the eight file types JPEG, MS PowerPoint, PDF, PNG, MS Word, MS Excel, MSG and “web page.” Findings Batch processing was sometimes hindered by stops that required manual interference. Depending on the software tool, three to four of these stops occurred during batch processing of the 80 test files. Furthermore, the conversion tools sometimes failed to produce output files even for supported file formats: three (Adobe Pro) up to seven (LuraTech and 3-HeightsTM) PDF/A-1b files were not produced. Since Adobe Pro does not convert e-mails, a total of 213 PDF/A-1b files were produced. The faithfulness of each conversion was investigated by comparing the visual appearance of the input document with that of the produced PDF/A-1b document on a computer screen. Meticulous visual inspection revealed that the conversion to PDF/A-1b impaired the information content in 24 of the converted 213 files (11 percent). These reproducibility errors included loss of links, loss of other document content (unreadable characters, missing text, document part missing), updated fields (reflecting time and folder of conversion), vector graphics issues and spelling errors. 
Originality/value These results indicate that large-scale batch conversions of heterogeneous files to PDF/A-1b cause complex issues that need to be addressed for each individual file. Even with considerable efforts, some information loss seems unavoidable if large numbers of files from heterogeneous sources are migrated to the PDF/A-1b format.},
number = {2},
urldate = {2021-09-27},
journal = {Library Hi Tech},
author = {Suri, Roland Erwin and El-Saad, Mohamed},
month = jan,
year = {2018},
note = {Publisher: Emerald Publishing Limited},
keywords = {Academic libraries, Archives, Conversion, Digital documents, Digital libraries, Digital preservation},
pages = {337--351},
file = {Snapshot:/Users/tullsen/Zotero/storage/2TRUASN7/html.html:text/html},
}
@misc{ferdZUGFeRD2020,
title = {{ZUGFeRD} 2.1.1},
copyright = {Apache 2.0},
shorttitle = {{ZUGFeRD} 2.1.1},
url = {https://www.ferd-net.de/standards/zugferd-2.1.1/zugferd-2.1.1.html},
abstract = {The German Forum on electronic Invoicing (FeRD) has developed a new release of uniform data format called ZUGFeRD 2.0 on 11th March 2019. It can be used for exchanging invoice data between enterprises, authorities and consumers. The format allows for an exchange of structural invoice data between the issuer and the recipient in a single PDF file without any necessary further steps of reading or processing the data.
ZUGFeRD 2.0 was developed in close coordination with the French standard Factur-X 1.0, is technically identical to it and thus also pursues the standardization objectives at the European level. The hybrid invoice format contains the structured invoice data in a PDF / A-3 file that forms the view component of the invoice. The structured invoice data can be read out and processed by the invoice recipient.
ZUGFeRD 2.0 meets the requirements of the EU Directive and the EU standard.},
language = {German, English},
publisher = {Forum for Electronic Invoicing Germany (FeRD)},
author = {{FeRD}},
month = jul,
year = {2020},
note = {Can be downloaded free of charge from above URL.},
keywords = {Invoicing, PDF/A},
file = {ZUGFeRD-2.1.1 - Vergleich_ZUGFeRD1_ZUGFeRD21.pdf:/Users/tullsen/Zotero/storage/ZYT7UWNJ/ZUGFeRD-2.1.1 - Vergleich_ZUGFeRD1_ZUGFeRD21.pdf:application/pdf;ZUGFeRD-2.1.1 - Specification_TA_ReferenceProfiles.pdf:/Users/tullsen/Zotero/storage/97E65XLZ/ZUGFeRD-2.1.1 - Specification_TA_ReferenceProfiles.pdf:application/pdf;ZUGFeRD-2.1.1 - Specification_TA_Part-B.pdf:/Users/tullsen/Zotero/storage/6V944WSS/ZUGFeRD-2.1.1 - Specification_TA_Part-B.pdf:application/pdf;ZUGFeRD-2.1.1 - Specification_TA_Part-A.pdf:/Users/tullsen/Zotero/storage/YPUW6UBR/ZUGFeRD-2.1.1 - Specification_TA_Part-A.pdf:application/pdf;ZUGFeRD-2.1.1 - Specification_Known-Issues.pdf:/Users/tullsen/Zotero/storage/DSEBMZXC/ZUGFeRD-2.1.1 - Specification_Known-Issues.pdf:application/pdf;ZUGFeRD-2.1.1 - Specification_TA.pdf:/Users/tullsen/Zotero/storage/7XK2HXAD/ZUGFeRD-2.1.1 - Specification_TA.pdf:application/pdf;EN16931 code lists values used from 2020-02-14.xlsx:/Users/tullsen/Zotero/storage/ZQ3N9HZM/EN16931 code lists values used from 2020-02-14.xlsx:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;ZUGFeRD-2.1.1 - Specification.pdf:/Users/tullsen/Zotero/storage/DFDQLTGT/ZUGFeRD-2.1.1 - Specification.pdf:application/pdf},
}
@misc{ferdOrderX2021,
title = {Order-{X} 1.0},
copyright = {Apache 2.0},
shorttitle = {Order-{X} 1.0},
url = {https://www.ferd-net.de/standards/download-packages/order-x-1.0.html},
abstract = {The German Forum on electronic Invoicing (FeRD) has developed a new release of uniform data format called Order-X. Order-X is the new hybrid format for the digitalized workflow of creating and processing orders. It is built on the same semantic data model as the hybrid format for electronic invoices, ZUGFeRD/Factur-X. The main advantage of such a hybrid format is it allows to replace traditional simple PDF or paper orders exchange by PDF orders with structured data for automated processing by companies which are willing or able to do so, especially SMEs. In addition, it is now possible to establish a seamless and collaborative process from order to invoice, in conjunction with ZUGFeRD/Factur-X, as both formats are based on the same XML structure.
Order-X is the hybrid equivalent to the electronic invoice standard ZUGFeRD/Factur-X, allowing to process purchase orders electronically. It is based on the sam international data model of the United Nations "UN/CEFACT Supply Chain Reference Data Model" (SCRDM) thus ensuring interoperability with ZUGFeRD/Factur-X.
In its hybrid version, Order-X embeds the structured information in a PDF/A-3 file which ist he visual representation of the order. The structured XML information can then be automatically processed by the receiver.},
language = {English, German},
publisher = {Forum for Electronic Invoicing Germany (FeRD)},
author = {{FeRD}},
month = apr,
year = {2021},
file = {01-2021 04 13 - FeRD-FNFE Cross Industry ORDER-X Process Specification V1.0.pdf:/Users/tullsen/Zotero/storage/BTACNBHA/01-2021 04 13 - FeRD-FNFE Cross Industry ORDER-X Process Specification V1.0.pdf:application/pdf;Electronic_Invoices_Practical_Guidelines_for_Companies.pdf:/Users/tullsen/Zotero/storage/IF7C2LLK/Electronic_Invoices_Practical_Guidelines_for_Companies.pdf:application/pdf},
}
@misc{aiimAIIMBP022008Best2008,
  title      = {{AIIM} {BP}-02-2008 {Best} {Practices} - {Implementation} guide for the {Portable} {Document} {Format} {Healthcare} ({PDF}/{H})},
  copyright  = {Copyright AIIM},
  shorttitle = {{PDF}/{H}},
  url        = {https://issuu.com/michaelejahn/docs/pdf-h_implementation_guide_2008/49},
  abstract   = {Portable Document Format (PDF) is a digital file format that provides a method for presenting information that is independent of the application software, hardware, and operating system used to create the information and of the output device used to display or print the information. The independent nature of PDF facilitates the process of creating, managing, securing, collecting, and exchanging digital content on diverse platforms and devices. As such, the use of PDF provides the basis for information portability and interoperability. The migration of multiple medical record types to a universal digital format would be enabled by implementation of an easily adopted document encapsulation practice. This practice would contain specifications for portability, interoperability, and security and would promote the exchange of healthcare information.
The Portable Document Format Healthcare, A Best Practices Guide (AIIM/ASTM BP-01-2008) is designed as an aide to help describe the technical aspects of such a practice. This companion Implementation Guide is meant to be an adjunct to the Best Practices Guide and provides sample implementation information.
The Portable Document Format Healthcare, A Best Practices Guide describes the practice of using PDF to facilitate a trusted means by which healthcare information is captured, exchanged, preserved, and protected among consumers and the rest of the healthcare system.
As an adjunct to the Best Practices Guide, this Implementation Guide is intended to serve as an example for implementation and does not preclude development or implementation of any other features or implementations of PDF Healthcare through additional usage models in support of storing, exchanging, sharing, or viewing of personal healthcare data.
This Implementation Guide recognizes that there is a continuum of implementation models ranging from large enterprise models, to independent software vendor models, small to medium sized healthcare provider offices, enterprise to physician, physician to enterprise, physician to physician, physician to other healthcare provider, patient to physician, patient to other healthcare provider, and more. This Implementation Guide cannot describe every use case but will provide a framework that can be replicated in a wide variety of instances.},
  language   = {English},
  publisher  = {AIIM},
  author     = {{AIIM}},
  year       = {2008},
  keywords   = {PDF/H},
}
@misc{aiimAIIMASTMBP0120082008,
title = {{AIIM}/{ASTM} {BP}-01-2008 {Portable} {Document} {Format} {Healthcare}: {A} {Best} {Practices} {Guide}},
copyright = {Copyright ASTM},
shorttitle = {{PDF}/{H}},
url = {https://www.astm.org/Standards/AIIMASTM.htm},
abstract = {This Portable Document Format Healthcare Best Practices Guide describes the features and functions of the proposed, voluntary, and industry-wide use of the Portable Document Format (PDF) for the healthcare industry. As such, the guide is intended to be used as a reference tool for defining PDF as an electronic container by which healthcare information can be captured, exchanged, preserved, and protected for consumers, care providers, and other stakeholders within the healthcare delivery system.
The guide does not describe a normative file format in the same manner as PDF/A. Instead the guide provides education on the use of PDF to support extensible Markup Language (XML) standards in the healthcare ecosystem and enable longer-term retention of PDF healthcare documents.
Note: This document is not Intended to address compliance with any applicable state and federal regulations that might apply to this information, including Public Law 104· 191 (1996), the Health Insurance Portability and Accountability Act (HIPAA).
The Implementation Gulde for PDF Healthcare is published as a separate document by AIIM and can be found at: http J/wvm.ai1m.org/pdfh/ig},
language = {English},
publisher = {AIIM},
author = {{AIIM}},
month = feb,
year = {2008},
note = {Specification can be purchased via ASTM webstore.},
keywords = {PDF/H},
file = {2008 - AIIMASTM BP-01-2008 Portable Document Format Heal.pdf:/Users/tullsen/Zotero/storage/6DEZS6WC/2008 - AIIMASTM BP-01-2008 Portable Document Format Heal.pdf:application/pdf},
}
@misc{hpPCLmPCLmS2013,
  title      = {{PCLm} / {PCLmS}},
  shorttitle = {{PCLm}},
  abstract   = {PCLm is an HP proprietary format for streaming mobile printing which is derived from PDF. PCLm was originally released by HP as part of e-Print and was later officially incorporated the WiFi Alliance WiFi Direct Print Services and promoted by Mopria as one of 3 PDLs for mobile printing (PCLm, PDF, PWG Raster).
The PCLm Specification is currently only available to WiFi Alliance Members. HP US patent application US 20130100486 A1 “Communication architectures for direct printing and scanning” discloses information on PCLm:
To create a streamable PDF document, a new protocol called Printer Control Language—Mobile (PCLm) has been created. PCLm has been developed using a subset of the PDF grammar. PCLm allows for consumption of the print files by all PDF 1.4 or later compliant devices and allows for both printing and viewing of the rasterized pages. PCLm may exclude a number of PDF grammar constructs including text, vector, images, patterns, transparency and blending instructions. The exclusion of these commands makes PCLm deterministic because commands that could be retroactively applied to data which has already been streamed have been eliminated. This allows printers to begin printing as soon as the PCLm data stream begins to be received. This minimizes the amount of memory and computation required by the printer and allows low end printers to effectively print large documents.
PCLm is device independent because PCLm supports standard and device independent imaging constructs. Image data is in contone space (8 bit gray and 24 bit RGB) and has not been altered in any device dependent way. PCLm is also minimalistic because it is designed to be lightweight and efficient. This allows low-end consuming devices with limited processing power and memory to consume PCLm files. However, the configurability of the language also enables high-end devices to improve throughput and performance.
As used in the specification and appended claims, the term “streamable” refers to print files whose data is not retroactively altered by subsequent data in the file. This allows the print files to be streamed to the printer and printing to begin prior to receiving the entire print file because portions of the print file that are received by the printer are not modified by subsequent data. Consequently, the printer can begin printing once a predetermined amount of data has been received. For example, after receiving a swath or page, the printing can commence because subsequent data will not retroactively alter the data already received. In some embodiments, the print files may be streamable in that each bit of data representing a portion of an image to be printed is not altered once it is received by the printer.
As used in the specification and appended claims, the term “deterministic” refers to files, protocols and techniques that produce print jobs (payload) of known size and complexity. This guarantees that the printer will always be able to print the job without causing a memory out or a performance issue. For example, a print file that uses retroactive commands is not deterministic because the complete print file must be received, stored and manipulated by the printer. Because print files have varying sizes (sometimes as large as hundreds of megabytes), there is no guarantee that the printer will have enough memory or processing power to print the job.
PCLm is viewable by all standard PDF viewers because it is constructed with a proper subset of PDF. Some PCLm language constructs are added to the grammar to facilitate viewing. These additional constructs can be ignored by consuming devices. PCLm documents are also archivable because they are constructed so that they can be stored and re-printed at later times.
Additionally, PCLm is predictable and deterministic in its rate of document consumption and page production. Some performance variability can be incurred by the size and complexity of the input page image, but this will not cause significant throughput degradation. PCLm formatted documents are streamable because all the content for imaging a page is delivered by the end of that page's description within the PCLm stream. Raster data contained within the page is delivered in a page-logical-top to page-logical-bottom, thus allowing trivial consumption by the consumable device. To enhance page streamability in low memory devices, PCLm can support segmentation of individual pages into strips or swaths. The height of the strips will be device dependent and discovered during the IPP device capabilities query. This process may reduce the amount of memory required to buffer the raster page.
The printer status, job control, and job transmission are performed using Internet Printing Protocol (IPP). IPP provides a standard network protocol for remote printing and managing print jobs, media size, resolution, etc. IPP is implemented using Hypertext Transfer Protocol (HTTP) and inherits all of the HTTP streaming and security features, including encryption protocols such as Transport Layer Security (TLS) and HTTPS. IPP allows clients to query a printer's capabilities, submit print jobs, query the status of a printer, query the status of print jobs, cancel previously submitted jobs, and other actions.
Some PCLm devices have limited resources and thus cannot perform all necessary raster and page-order transformations to support duplex printing. Therefore, PCLm devices may require that the client develop the raster for the backside of duplex jobs so that it is oriented (flipped and or mirrored) correctly for the consuming device and possibly reorder the pages. The client computing device can discover the requirements of the consuming device by the IPP attribute backside-orientation and backside-scan-direction. PCLm can support a number of scanline orientations for backside duplex pages including: bottom-to-top or top-to-bottom, right-to-left or left-to-right, and frontside-first or backside-first. These scan line orientations are noted in the file and may not adversely affect the viewing of the file.
Other websites also describe “PCLm/PCLmS” as streaming mobile formats. For example http://en.wikipedia.org/wiki/Talk\%3APrinter\_Command\_Language:
What is unique about PCLm/PCLmS is that it contains Job Ticketing metadata which is very important in future mobile printing applications. So, you will start seeing many more printers that will list support for PCLm/PCLmS format. The other format you will start seeing is PWG format, which also includes job ticketing information. Both of these raster formats will be used by mobile devices to minimize the code required to generate a print stream that could be printed to virtually any printer connected to the Cloud or easily viewed on any supporting device. There is very little information about PCLm/PCLmS format. But, PWG is very well documented and its positioning is explained at the W3 Printer Work Group website: https://www.pwg.org/. PageTech's PCLReader, PCLWorks and PCLTool SDK products all have the ability to view/transform both the PCLm/PCLmS and PWG formats including the job ticketing metadata. They can also convert PCL with whatever job ticketing data is available in PCL or PJL format into PCLm/PCLmS and PWG formats. Job ticketing information (copy count, duplexing, stapling, color, etc.) is used to determine which printers are available that can print your file.},
  language   = {English},
  publisher  = {Hewlett Packard},
  author     = {{HP}},
  year       = {2013},
  file       = {UPnP_File_Transfer_Service_Technical_Specification_v1.1.pdf:/Users/tullsen/Zotero/storage/KMFCJWSV/UPnP_File_Transfer_Service_Technical_Specification_v1.1.pdf:application/pdf;Wi-Fi_Peer-to-Peer_Services_Print_Technical_Specification_v1.1.pdf:/Users/tullsen/Zotero/storage/3D5CDALD/Wi-Fi_Peer-to-Peer_Services_Print_Technical_Specification_v1.1.pdf:application/pdf;Wi-Fi_Peer-to-Peer_Services_Technical_Specification_v1.2.pdf:/Users/tullsen/Zotero/storage/JJMV64A2/Wi-Fi_Peer-to-Peer_Services_Technical_Specification_v1.2.pdf:application/pdf},
}
@misc{ferdZUGFeRD2014,
title = {{ZUGFeRD} 1.0},
shorttitle = {{ZUGFeRD} 1.0},
url = {https://www.ferd-net.de/zugferd/zugferd-1.0/index.html},
abstract = {ZUGFeRD 1.0 (June 2014) is a German standard based on PDF/A-3 that allows the blind exchange of invoices between supplier and payer without any requirements for prior arrangements. ZUGFeRD invoices can be deployed universally and are not limited to specific industry sectors or company sizes (unlike EDI). Private enterprises as well as public administration can efficiently organize their invoice processing with ZUGFeRD. The standard has been created by a working group which comprises members from the public administration, three German federal ministries, industry associations in the financial, tax and software sectors, and other organizations.
ZUGFeRD invoices carry both a human-readable representation (rendering) of the invoice as well as a structured machine-readable XML representation based on “Core Cross Industry Invoice” (CII) developed by UN/CEFACT. The CII provides a large framework with more than 2000 elements. The framework is modelled around business processes and relationships. The human-readable rendering is encoded as one or more PDF pages according to the PDF/A standard. In order to associate both invoice renderings with each other, ZUGFeRD leverages an important feature of PDF/A-3 (ISO 19005-3) which allows embedded attachments of arbitrary types into the PDF/A document. The XML invoice data is embedded in the PDF document as an attachment according to PDF/A-3 “Associated File” feature. In other words, ZUGFeRD invoices contain two separate renderings of the invoice where PDF/A-3 serves both as one of the renderings as well as the container for the XML CII rendering.
See http://www.pdflib.com/knowledge-base/pdfa/zugferd-invoices/},
language = {German, English},
publisher = {Forum for Electronic Invoicing Germany (FeRD)},
author = {{FeRD}},
month = jun,
year = {2014},
note = {Can be downloaded free of charge from above URL.},
keywords = {Invoicing, PDF/A},
}
@misc{vdaVDA49532Drawingfree2015,
  title      = {{VDA} 4953-2 {Drawing}-free {Product} {Documentation}},
  shorttitle = {{VDA} 4953-2},
  url        = {https://www.vda.de/en/services/Publications/drawing-free-product-documentation.html},
  abstract   = {The first part of VDA Recommendation 4953 described working with simplified drawings combined with 3D models and a master data sheet. Part 2 of the Recommendation describes the means available to the automotive industry for producing product documentation within a drawing-free process (DFP)
Available languages: German, English},
  language   = {English},
  publisher  = {Verband der Automobilindustrie (VDA)},
  author     = {{VDA}},
  month      = mar,
  year       = {2015},
  keywords   = {PDF/A},
  file       = {2015 - VDA 4953-2 Drawing-free Product Documentation.pdf:/Users/tullsen/Zotero/storage/HTKISTUY/2015 - VDA 4953-2 Drawing-free Product Documentation.pdf:application/pdf},
}
@article{zhangjiandongExtractingPDFTables2021,
title = {Extracting {PDF} {Tables} {Based} on {Word} {Vectors}},
volume = {5},
issn = {2096-3467},
url = {http://manu44.magtech.com.cn/Jwk_infotech_wk3/EN/abstract/abstract5140.shtml},
doi = {10.11925/infotech.2096-3467.2021.0164},
language = {Chinese},
number = {8},
urldate = {2021-09-26},
journal = {Data Analysis and Knowledge Discovery},
author = {Zhang, Jiandong and Chen, Shiji},
month = sep,
year = {2021},
pages = {34--44},
file = {Zhang Jiandong and Zhang Jiandong - 2021 - Extracting PDF Tables Based on Word Vectors.pdf:/Users/tullsen/Zotero/storage/FQ6AHH9E/Zhang Jiandong and Zhang Jiandong - 2021 - Extracting PDF Tables Based on Word Vectors.pdf:application/pdf},
}
@article{damoreTraitorProofPDFWatermarking2021,
title = {Traitor-{Proof} {PDF} {Watermarking}},
url = {http://arxiv.org/abs/2109.09712},
abstract = {This paper presents a traitor-tracing technique based on the watermarking of digital documents (pdf files in particular). The watermarking algorithm uses a chain of three separate techniques that work in synergy. The embedded payload can withstand a wide range of attacks and cannot be removed without invalidating the credibility of the document.},
language = {English},
urldate = {2021-09-26},
journal = {arXiv:2109.09712 [cs]},
author = {d'Amore, Fabrizio and Serpi, Alessandro},
month = sep,
year = {2021},
note = {arXiv: 2109.09712},
eprint = {2109.09712},
archiveprefix = {arXiv},
primaryclass = {cs.CR},
keywords = {Computer Science - Cryptography and Security},
annote = {Comment: 23 pages LNCS formatted, 3 figures. submitted to workshop},
file = {d'Amore and Serpi - 2021 - Traitor-Proof PDF Watermarking.pdf:/Users/tullsen/Zotero/storage/VYIDKVZR/d'Amore and Serpi - 2021 - Traitor-Proof PDF Watermarking.pdf:application/pdf},
}
@article{fayyazAccessibilityTablesPDF2021,
  title    = {Accessibility of {Tables} in {PDF} {Documents}},
  volume   = {40},
  issn     = {2163-5226, 0730-9295},
  url      = {https://ejournals.bc.edu/index.php/ital/article/view/12325},
  doi      = {10.6017/ital.v40i3.12325},
  abstract = {People access and share information over the web and in other digital environments, including digital libraries, in the form of documents such as books, articles, technical reports, etc. These documents are in a variety of formats, of which the Portable Document Format (PDF) is most widely used because of its emphasis on preserving the layout of the original material. The retrieval of relevant material from these derivative documents is challenging for information retrieval (IR) because the rich semantic structure of these documents is lost. The retrieval of important units such as images, figures, algorithms, mathematical formulas, and tables becomes a challenge. Among these elements, tables are particularly important because they can add value to the resource description, discovery, and accessibility of documents not only on the web but also in libraries if they are made retrievable and presentable to readers. Sighted users comprehend tables for sensemaking using visual cues, but blind and visually impaired users must rely on assistive technologies, including textto-speech and screen readers, to comprehend tables. However, these technologies do not pay sufficient attention to tables in order to effectively present tables to visually impaired individuals. Therefore, ways must be found to make tables in PDF documents not only retrievable but also comprehensible. Before developing such solutions, it is necessary to review the available assistive technologies, tools, and frameworks for their capabilities, strengths, and limitations from the comprehension perspective of blind and visually impaired people, along with suitable environments like digital libraries. We found no such review article that critically and analytically presents and evaluates these technologies. 
To fill this gap in the literature, this review paper reports on the current state of the accessibility of PDF documents, digital libraries, assistive technologies, tools, and frameworks that make PDF tables comprehensible and accessible to blind and visually impaired people. The study findings have implications for libraries, information sciences, and information retrieval.},
  language = {English},
  number   = {3},
  urldate  = {2021-09-26},
  journal  = {Information Technology and Libraries},
  author   = {Fayyaz, Nosheen and Khusro, Shah and Ullah, Shakir},
  month    = sep,
  year     = {2021},
  file     = {Fayyaz et al. - 2021 - Accessibility of Tables in PDF Documents.pdf:/Users/tullsen/Zotero/storage/C7NBM4YN/Fayyaz et al. - 2021 - Accessibility of Tables in PDF Documents.pdf:application/pdf},
}
@article{ullrichRealworldStringComparison2021,
title = {Real-world {String} {Comparison}: {How} to handle {Unicode} sequences correctly},
volume = {19},
issn = {1542-7730},
shorttitle = {Real-world {String} {Comparison}},
url = {https://doi.org/10.1145/3475965.3478522},
doi = {10.1145/3475965.3478522},
abstract = {In many languages a string comparison is a pitfall for beginners. With any Unicode string as input, a comparison often causes problems even for advanced users. The semantic equivalence of different characters in Unicode requires a normalization of the strings before comparing them. This article shows how to handle Unicode sequences correctly. The comparison of two strings for equality often raises questions concerning the difference between comparison by value, comparison of object references, strict equality, and loose equality. The most important aspect is semantic equivalence.},
language = {English},
number = {3},
urldate = {2021-09-14},
journal = {Queue},
author = {Ullrich, Torsten},
month = jun,
year = {2021},
note = {https://queue.acm.org/detail.cfm?id=3478522},
pages = {50:107--50:116},
file = {Ullrich - 2021 - Real-world String Comparison How to handle Unicod.pdf:/Users/tullsen/Zotero/storage/H8XUDJC5/Ullrich - 2021 - Real-world String Comparison How to handle Unicod.pdf:application/pdf},
}
@misc{j.reynaPotential360Degree2021,
title = {The {Potential} of 360 {Degree} {Videos} - {PDF} {Flipbook}},
url = {https://abox.pub/the-potential-of-360-degree-videos-pdf-flipbook.html},
abstract = {Cutting-edge video technologies have the potential to impact teaching, learning and research by providing more efficient, flexible and immerse experiences. In the early 90s, Apple developed QuickTime Virtual Reality (QTVR), and it can be considered an inspiration for 360-degree videos.
QuickTime VR technology used a series of pictures and stitched them together cylindrically (images wrapped around the viewer) using a QuickTime movie file. Users were able to scroll up and down, right and left, zoom in and out and even click links that contained audio or pop-up windows. In the late 90s, applications such as PanoViewer were developed using Flash that has similar functionality. With the mobile phone (2007) and tablet revolution (2010), these applications became redundant, and mobile applications started to offer VR experiences. Regrettably; it never has a massive uptake for education nor the general public. Twenty years later, the 360-degree video cameras were introduced, and YouTube support for 360-degree videos started (2015). Currently, there are more than twenty 360-degree video camera brands on the market. The growth of action cameras and applications may inspire this trend. This paper covers the technical side of 360-degree videos and discusses their potential application for teaching, learning and research.},
language = {English},
urldate = {2021-09-14},
journal = {abox.pub},
author = {Reyna, J.},
month = apr,
year = {2021},
note = {Lecture in Higher Education, Learning Design, Digital Media for Learning Scholar, Faculty of Science, University of Technology Sydney (AUSTRALIA)},
file = {Snapshot:/Users/tullsen/Zotero/storage/UWGMBVZS/the-potential-of-360-degree-videos-pdf-flipbook.html:text/html},
}
@inproceedings{nayanBanglaPDFSpeaker2021,
title = {Bangla {PDF} {Speaker}: {A} {Complete} {Computer} {Application} to {Convert} {Bangla} {PDF} to {Speech}},
shorttitle = {Bangla {PDF} {Speaker}},
doi = {10.1109/ACMI53878.2021.9528221},
abstract = {In this paper, a complete computer application is presented that can convert Bangla PDF to Bangla Speech. According to the proposed technique, images are extracted from PDF and then after processing the images, they are sent to OCR engine to extract text. Extracted text are then normalized and sent to text to speech (TTS) engine to generate speech. Image processing is a key component of the developed application as it increases the efficiency of OCR engine to a great extent. We propose a novel threshold selection method that is able to detect type of noise in the extracted image and select threshold accordingly for binary transformation. Thus it solves the problem of selecting appropriate threshold of different images and it increases the overall accuracy and efficiency of the application. Another feature that has improved the performance of introduced computer application is text normalization. Normalization of the extracted text from the OCR engine makes the text more accurate to pronounce by the TTS engine depending on the context. Finally, we present experimental results that show 80.804\% accuracy on text extraction from the PDF file and 3.92 score (out of 5) on the generated speech by human evaluation.},
booktitle = {2021 {International} {Conference} on {Automation}, {Control} and {Mechatronics} for {Industry} 4.0 ({ACMI})},
author = {Nayan, Md. Mizanur Rahaman and Haque, Mohammad Ariful},
month = jul,
year = {2021},
keywords = {Autonomous binarization threshold selection, Computer applications, Feature extraction, image processing, Mechatronics, Neural networks, Optical character recognition software, PDF to speech, Process control, text extraction, text normalization, text to speech, Training},
pages = {1--5},
file = {IEEE Xplore Abstract Record:/Users/tullsen/Zotero/storage/V8A2KTFQ/9528221.html:text/html},
}
@inproceedings{mishraExtractionTheoremsProofs2021,
address = {New York, NY, USA},
series = {{DocEng} '21},
title = {Towards extraction of theorems and proofs in scholarly articles},
isbn = {978-1-4503-8596-1},
url = {https://doi.org/10.1145/3469096.3475059},
doi = {10.1145/3469096.3475059},
abstract = {Scholarly articles in mathematical fields often feature mathematical statements (theorems, propositions, etc.) and their proofs. In this paper, we present preliminary work for extracting such information from PDF documents, with several types of approaches: vision (using YOLO), natural language (with transformers), and styling information (with linear conditional random fields). Our main task is to identify which parts of the paper to label as theorem-like environments and proofs. We rely on a dataset collected from arXiv, with LaTeX sources of research articles used to train the models.},
urldate = {2021-08-17},
booktitle = {Proceedings of the 21st {ACM} {Symposium} on {Document} {Engineering}},
publisher = {Association for Computing Machinery},
author = {Mishra, Shrey and Pluvinage, Lucas and Senellart, Pierre},
month = aug,
year = {2021},
keywords = {information extraction, proofs, scholarly articles, theorems},
pages = {1--4},
file = {Mishra et al. - 2021 - Towards extraction of theorems and proofs in schol.pdf:/Users/tullsen/Zotero/storage/Q75EXHAT/Mishra et al. - 2021 - Towards extraction of theorems and proofs in schol.pdf:application/pdf},
}
@inproceedings{pradhanPDFTextSentiment2021,
  address   = {Singapore},
  series    = {Advances in {Intelligent} {Systems} and {Computing}},
  title     = {{PDF} {Text} {Sentiment} {Analysis}},
  isbn      = {9789811625947},
  doi       = {10.1007/978-981-16-2594-7_55},
  abstract  = {Nowadays the internet has become a great source in terms of unstructured data. In the sentiment inspection, unprocessed text is operated, and it has brought different issues in computer processing. To avoid such issues, various steps and tactics are done. The paper gives an insight into the ground of sentiment inspection targeting today’s analysis works—lexicon-based work, context less categorization, and deep analysis. Sentiment mining, a main newbie subcategory in inspection, is discussed in this project. The main objective of the project is to describe a brief introduction to this emerging issue and to represent complete research of all major survey problems and the present increment in the ground. As a proof of that, this project requires greater than 400 links from all important journals. However the ground works with the natural language text, which is generally counted in the unprocessed data, this project has done a structured line or option in describing the difficulty with the objective of linking the unprocessed and processed ground and doing qualitative and quantitative analysis of emotions. It is major and important for practical applications.},
  language  = {English},
  booktitle = {International {Conference} on {Innovative} {Computing} and {Communications}},
  publisher = {Springer},
  author    = {Pradhan, Rahul and Gangwar, Kushagra and Dubey, Ishika},
  editor    = {Khanna, Ashish and Gupta, Deepak and Bhattacharyya, Siddhartha and Hassanien, Aboul Ella and Anand, Sameer and Jaiswal, Ajay},
  month     = aug,
  year      = {2021},
  keywords  = {PDF, Multilingual, Sentiment analysis},
  pages     = {679--690},
}
@article{baeLearn2EvadeLearningbasedGenerative2021,
title = {{Learn2Evade}: {Learning}-based {Generative} {Model} for {Evading} {PDF} {Malware} {Classifiers}},
issn = {2691-4581},
shorttitle = {{Learn2Evade}},
url = {https://ieeexplore.ieee.org/abstract/document/9512394},
doi = {10.1109/TAI.2021.3103139},
abstract = {Recent research has shown that a small perturbation to an input may forcibly change the prediction of a machine learning (ML) model. Such variants are commonly referred to as adversarial examples. Early studies have focused mostly on ML models for image processing and expanded to other applications, including those for malware classification. In this paper, we focus on the problem of finding adversarial examples against ML-based PDF malware classifiers. We deem that our problem is more challenging than those against ML models for image processing because of the highly complex data structure of PDF and of an additional constraint that the generated PDF should exhibit malicious behavior. To resolve our problem, we propose a variant of generative adversarial networks (GANs) that generate evasive variant PDF malware (without any crash), which can be classified as benign by various existing classifiers yet maintaining the original malicious behavior. Our model exploits the target classifier as the second discriminator to rapidly generate an evasive variant PDF with our new feature selection process that includes unique features extracted from malicious PDF files. We evaluate our technique against three representative PDF malware classifiers (Hidost13, Hidost16, and PDFrate-v2) and further examine its effectiveness with AntiVirus engines from VirusTotal. To the best of our knowledge, our work is the first to analyze the performance against the commercial AntiVirus engines. Our model finds, with great speed, evasive variants for all selected seeds against state-of-the-art PDF malware classifiers and raises a serious security concern in the presence of adversaries.},
journal = {IEEE Transactions on Artificial Intelligence},
author = {Bae, Ho and Lee, Younghan and Kim, Yohan and Hwang, Uiwon and Yoon, Sungroh and Paek, Yunheung},
month = aug,
year = {2021},
note = {Conference Name: IEEE Transactions on Artificial Intelligence},
keywords = {Portable document format, Feature extraction, Training, Adversarial Examples, Artificial intelligence, Detectors, Evading PDF Classifiers, Generative Adversarial Networks, Malware, PDF Malware, Perturbation methods},
pages = {1--1},
file = {Bae et al. - 2021 - Learn2Evade Learning-based Generative Model for E.pdf:/Users/tullsen/Zotero/storage/6HB3GHHC/Bae et al. - 2021 - Learn2Evade Learning-based Generative Model for E.pdf:application/pdf},
}
@article{evansUsePDFDigital2014,
title = {The {Use} of {PDF}/{A} in {Digital} {Archives}: {A} {Case} {Study} from {Archaeology}},
volume = {9},
copyright = {Copyright (c)},
issn = {1746-8256},
shorttitle = {The {Use} of {PDF}/{A} in {Digital} {Archives}},
url = {http://www.ijdc.net/article/view/9.2.123},
doi = {10.2218/ijdc.v9i2.267},
abstract = {In recent years the Portable Document Format (PDF) has become a ubiquitous format in the exchange of documents; in 2005 the PDF/A profile was defined in order to meet long term accessibility needs, and has accordingly come to be regarded as a long-term archiving strategy for PDF files. In the field of archaeology, a growing number of PDF files – containing the detailed results of fieldwork and research – are beginning to be deposited with digital archives such as the Archaeology Data Service (ADS). In the ADS’ experience, the use of PDF/A has had benefits as well as drawbacks: the majority of PDF reports are now in a standard format better suited to longer-term access, however migrating to PDF/A and managing and ensuring reuse of these files is intensive, and fraught with potential pitfalls. Of these, perhaps the most serious has been an unreliability in PDF/A conformance by the wide range of tools and software now available. There are also practical and more theoretical implications for reuse which, as our discipline of archaeology alongside so many others rapidly becomes digitized, presents us with a large corpus of ‘data’ that is human readable, but may not be amenable to machine-based technologies such as NLP. It may be argued that these factors effectively undermine some of the perceived cost benefit of moving from paper to digital, as well as the longer-term sustainability of PDF/A within digital archives.},
language = {en},
number = {2},
urldate = {2021-08-13},
journal = {International Journal of Digital Curation},
author = {Evans, Tim N. L. and Moore, Ray H.},
month = oct,
year = {2014},
note = {Number: 2},
keywords = {curation, DCC, digital curation, digital preservation, IJDC, International Journal of Digital Curation, preservation},
pages = {123--138},
file = {Evans and Moore - 2014 - The Use of PDFA in Digital Archives A Case Study.pdf:/Users/tullsen/Zotero/storage/ZKVTCXGH/Evans and Moore - 2014 - The Use of PDFA in Digital Archives A Case Study.pdf:application/pdf},
}
@incollection{mccargarNewspapersDataFormats2011,
title = {Newspapers, data formats, and acronym stew: {Preservation} and distribution of born-digital newspapers using {METS}/{ALTO}, {NITF}, and {PDF}-{A}},
isbn = {978-3-11-025531-7},
shorttitle = {Newspapers, data formats, and acronym stew},
url = {https://www.degruyter.com/document/doi/10.1515/9783110255317.115/html},
doi = {10.1515/9783110255317.115},
abstract = {Newspapers, data formats, and acronym stew: Preservation and distribution of born-digital newspapers using METS/ALTO, NITF, and PDF-A was published in Newspapers on page 115.},
language = {en},
urldate = {2021-08-13},
booktitle = {Newspapers},
publisher = {De Gruyter Saur},
author = {McCargar, Victoria and Nadal, Jacob and Snyder, Henry and Vanek, Andrea and Zarndt, Frederick},
month = may,
year = {2011},
pages = {115--124},
file = {Snapshot:/Users/tullsen/Zotero/storage/3DZD2M5F/html.html:text/html},
}
@article{sullivanArchivalRecordsManagement2006,
title = {An archival/records management perspective on {PDF}/{A}},
volume = {16},
issn = {0956-5698},
url = {https://doi.org/10.1108/09565690610654783},
doi = {10.1108/09565690610654783},
abstract = {Purpose – This article sets out to explain the purpose of PDF/A, how it addresses archival and records management concerns, how PDF/A was designed to have “desirable properties of a long‐term preservation format”, and the future of PDF/A. Design/methodology/approach – The contents of this article are based on the author's knowledge and experience of the subject. Findings – It is emphasized that PDF/A must be implemented in conjunction with policies and procedures, including quality assurance procedures to ensure acceptable replication of source material. Originality/value – This article will be of interest to anyone working with PDF files. Work has already begun on PDF/A Part 2 which will be based on PDF 1.6. Application notes and a listing of frequently asked questions will be made publicly available to assist developers of PDF/A applications to better understand the requirements of the file format and provide implementation guidance.},
number = {1},
urldate = {2021-08-13},