1 @INPROCEEDINGS{chatnoir,
\r
2 AUTHOR = {Martin Potthast and Matthias Hagen and Benno Stein and Jan Gra{\ss}egger and Maximilian Michel and Martin Tippmann and Clement Welsch},
\r
3 BOOKTITLE = {35th International ACM Conference on Research and Development in Information Retrieval (SIGIR 12)},
\r
5 EDITOR = {Bill Hersh and Jamie Callan and Yoelle Maarek and Mark Sanderson},
\r
10 SITE = {Portland, Oregon},
\r
11 TITLE = {{ChatNoir: A Search Engine for the ClueWeb09 Corpus}},
\r
15 @BOOK{text_patterns,
\r
16 author = "Mike Scott and Christopher Tribble",
\r
17 title = "{Textual Patterns, Key Words and Corpus Analysis in Language Education}",
\r
19 publisher = "{John Benjamins Publishing Company}",
\r
25 title = "{Sketch Engine EnTenTen Corpus}",
\r
26 howpublished = "\url{http://trac.sketchengine.co.uk/wiki/Corpora/enTenTen}",
\r
30 @book{Introduction_to_information_retrieval,
\r
31 abstract = {Class-tested and coherent, this textbook teaches classical and web information retrieval, including web search and the related areas of text classification and text clustering from basic concepts. It gives an up-to-date treatment of all aspects of the design and implementation of systems for gathering, indexing, and searching documents; methods for evaluating systems; and an introduction to the use of machine learning methods on text collections. All the important ideas are explained using examples and figures, making it perfect for introductory courses in information retrieval for advanced undergraduates and graduate students in computer science. Based on feedback from extensive classroom experience, the book has been carefully structured in order to make teaching more natural and effective.},
\r
32 added-at = {2012-05-30T10:50:27.000+0200},
\r
33 address = {Cambridge, UK},
\r
34 author = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},
\r
35 biburl = {http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63},
\r
36 file = {Cambridge University Press Product Page:http\://www.cambridge.org/9780521865715:URL;Amazon Search inside:http\://www.amazon.de/gp/reader/0521865719/:URL;Google Books:http\://books.google.de/books?isbn=978-0-521-86571-5:URL},
\r
39 interhash = {b6954037b1d444f4afe4cad883b4d80c},
\r
40 intrahash = {8516d94c1f7aa1e391ddd3ace4caa23b},
\r
41 isbn = {978-0-521-86571-5},
\r
42 keywords = {v1205 book ai information retrieval language processing search xml web},
\r
43 publisher = {Cambridge University Press},
\r
44 timestamp = {2012-05-30T10:50:27.000+0200},
\r
45 title = {Introduction to Information Retrieval},
\r
46 username = {flint63},
\r
51 @INPROCEEDINGS{Knight,
\r
52 author = {Allan Knight and Kevin Almeroth and Bruce Bimber},
\r
53 title = {An Automated System for Plagiarism Detection Using the Internet},
\r
54 booktitle = {Proceedings of World Conference on Educational Multimedia, Hypermedia and Telecommunications, pp. 3619--3625},
\r
59 @INPROCEEDINGS{awfc,
\r
60 author = {{Meyer zu Eissen}, Sven and Stein, Benno},
\r
61 title = {Intrinsic Plagiarism Detection},
\r
62 booktitle = {Proceedings of the European Conference on Information Retrieval (ECIR-06)},
\r
66 @INPROCEEDINGS{Kasprzak2008,
\r
67 AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c} and Pavel {\v S}merk",
\r
68 TITLE = "Distributed System for Discovering Similar Documents",
\r
69 SUBTITLE = "From a Relational Database to the Custom-Developed Parallel solution",
\r
70 BOOKTITLE = "ICEIS 2008: Proceedings of the Tenth International Conference on Enterprise Information Systems, Vol. DISI---Databases and Information Systems Integration",
\r
72 publisher = "INSTICC (Institute for Systems and Technologies of Information, Control and Communication), Setúbal, Portugal",
\r
74 isbn = "978-989-8111-36-4",
\r
78 @INPROCEEDINGS{Kasprzak2009,
\r
79 AUTHOR = "Jan Kasprzak and Michal Brandejs and Jitka Brandejsov\'{a}",
\r
80 TITLE = "Distributed Aspects of the System for Discovering Similar Documents",
\r
81 BOOKTITLE = "ITA 09: Proceedings of the Third International Conference on Internet Technology and Applications",
\r
86 @INPROCEEDINGS{Kasprzak2009a,
\r
87 AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c}",
\r
88 TITLE = "Finding Plagiarism by Evaluating Document Similarities",
\r
89 BOOKTITLE = "SEPLN'09: The 25th edition of the Annual Conference of the Spanish Society for Natural Language Processing",
\r
94 @INPROCEEDINGS{Monostori2002,
\r
95 author = {Kriszti\'{a}n Monostori and Raphael A. Finkel and Arkady B. Zaslavsky and G\'{a}bor Hod\'{a}sz and M\'{a}t\'{e} Pataki},
\r
96 title = {Comparison of Overlap Detection Techniques},
\r
97 booktitle = {ICCS '02: Proceedings of the International Conference on Computational Science-Part I},
\r
99 isbn = {3-540-43591-3},
\r
101 publisher = {Springer-Verlag},
\r
102 address = {London, UK},
\r
107 title = "{Czech National Archive of Graduate Theses}",
\r
108 howpublished = "\url{http://theses.cz/}",
\r
109 year = "2008--2011",
\r
114 title = "{Masaryk University Information System}",
\r
115 howpublished = "\url{http://is.muni.cz/}",
\r
116 year = "1999--2011",
\r
120 key = "{Odevzdej.CZ}",
\r
121 title = "{Odevzdej---the system for collecting seminar works}",
\r
122 howpublished = "\url{http://odevzdej.cz/}",
\r
123 year = "2009--2011",
\r
127 @inproceedings{finkel2002,
\r
128 author = {Finkel, Raphael A. and Zaslavsky, Arkady and Monostori, Kriszti\'{a}n and Schmidt, Heinz},
\r
129 title = {Signature extraction for overlap detection in documents},
\r
130 booktitle = {ACSC '02: Proceedings of the twenty-fifth Australasian conference on Computer science},
\r
132 isbn = {0-909925-82-8},
\r
134 location = {Melbourne, Victoria, Australia},
\r
135 publisher = {Australian Computer Society, Inc.},
\r
136 address = {Darlinghurst, Australia},
\r
139 @INPROCEEDINGS{broder97,
\r
140 title={On the resemblance and containment of documents},
\r
141 author={Broder, Andrei Z.},
\r
142 booktitle={Compression and Complexity of Sequences 1997. Proceedings},
\r
148 keywords={information retrieval, random processes, set theory, Rabin fingerprints, World Wide Web, containment, documents, fixed size sample, informal notions, information retrieval, intersection problems, mathematical notions, mathematical properties, random sampling, resemblance, roughly contained, roughly the same},
\r
149 doi={10.1109/SEQUEN.1997.666900},
\r
154 author="Rivest, R.",
\r
155 title="{RFC1321: The MD5 Message-Digest Algorithm}",
\r
157 publisher = {RFC Editor},
\r
158 address = {United States},
\r
159 note={\url{http://www.rfc-editor.org/rfc/rfc1321.txt}},
\r
162 @Misc{britannicaplagiarism,
\r
163 author = "Encyclop\ae{}dia Britannica",
\r
164 title = "Plagiarism",
\r
165 howpublished = "retrieved 2009--08--24 from \url{http://www.britannica.com/EBchecked/topic/462640/plagiarism}",
\r
170 author = "iDnes.CZ",
\r
171 title = "Zlínského děkana usvědčili z plagiátorství",
\r
172 howpublished = "retrieved 2009--08--25 from \url{http://zpravy.idnes.cz/studium.asp?c=A080709_085836_studium_bar}",
\r
176 @inproceedings{pomikalek2008,
\r
177 author = "Pomikálek, Jan and Rychlý, Pavel",
\r
178 title = "Detecting Co-Derivative Documents in Large Text Collections",
\r
179 booktitle = "Proceedings of the Sixth International Language Resources and Evaluation (LREC'08)",
\r
182 address = "Marrakech, Morocco",
\r
183 url = "http://www.lrec-conf.org/lrec2008/"
\r
186 @inproceedings{pomikalek2009,
\r
187 author = "Pomikálek, Jan and Rychlý, Pavel and Kilgarriff, Adam",
\r
188 title = "Scaling to Billion-plus Word Corpora",
\r
189 booktitle = "Advances in Computational Linguistics",
\r
192 address = "Mexico",
\r
193 issn = "1870-4069",
\r
194 publisher = "Instituto Politécnico Nacional",
\r
199 title = "Masaryk University: Full-text search",
\r
200 howpublished = "retrieved 2009--08--25 from \url{http://www.muni.cz/general/search}",
\r
205 author = "Alan Cox",
\r
206 title = "{Alan Cox talks about laws... and Linux}",
\r
207 howpublished = "retrieved 2009--08--27 from \url{http://interviews.slashdot.org/article.pl?sid=02/05/20/1314214}",
\r
211 @inproceedings{coderivative,
\r
212 Author = {Bernstein, Y and Zobel, J},
\r
213 Title = {{A Scalable System for Identifying Co-derivative Documents}},
\r
214 Booktitle = {{String Processing and Information Retrieval, Proceedings}},
\r
215 Series = {{Lecture Notes in Computer Science}},
\r
219 Publisher = {{Springer-Verlag Berlin}},
\r
220 Type = {{Proceedings Paper}},
\r
221 Language = {{English}},
\r
222 Affiliation = {{Bernstein, Y (Reprint Author), RMIT Univ, Sch Comp Sci \& Informat Technol, Melbourne, Vic, Australia.
\r
223 RMIT Univ, Sch Comp Sci \& Informat Technol, Melbourne, Vic, Australia.}},
\r
224 ISSN = {{0302-9743}},
\r
225 ISBN = {{3-540-23210-9}},
\r
226 Keywords-Plus = {{COMPRESSION}},
\r
227 Subject-Category = {{Computer Science, Theory \& Methods}},
\r
228 Author-Email = {{ybernste@cs.rmit.edu.au
\r
229 jz@cs.rmit.edu.au}},
\r
230 Number-of-Cited-References = {{17}},
\r
231 Times-Cited = {{2}},
\r
232 Doc-Delivery-Number = {{BBA15}},
\r
233 Unique-ID = {{ISI:000224377200006}},
\r
237 key = "{Turnitin.com}",
\r
238 title = "Turnitin",
\r
239 howpublished = "\url{http://turnitin.com/}, retrieved 2009--08--26",
\r
244 key = "{Copyscape.com}",
\r
245 title = "Copyscape",
\r
246 howpublished = "\url{http://copyscape.com/}, retrieved 2009--08--26",
\r
251 key = "{Doccop.com}",
\r
253 howpublished = "\url{http://doccop.com/}, retrieved 2009--08--26",
\r
257 @MISC{pan09competition,
\r
259 title = "1st International Competition on Plagiarism Detection",
\r
260 howpublished = "\url{http://www.uni-weimar.de/medien/webis/research/workshopseries/pan-09/competition.html}, retrieved 2009--08--26",
\r
264 @INPROCEEDINGS{Brin95copydetection,
\r
265 author = {Sergey Brin and James Davis and Hector Garcia-Molina},
\r
266 title = {Copy Detection Mechanisms for Digital Documents},
\r
267 booktitle = {Proceedings of the ACM SIGMOD Annual Conference},
\r
272 @INPROCEEDINGS{Shivakumar95scam,
\r
273 author = {Narayanan Shivakumar and Hector Garcia-Molina},
\r
274 title = {SCAM: A Copy Detection Mechanism for Digital Documents},
\r
275 booktitle = {Proceedings of the Second Annual Conference on the Theory and Practice of Digital Libraries},
\r
279 @INPROCEEDINGS{Garcia-Molina96dscam:finding,
\r
280 author = {Hector Garcia-Molina and Luis Gravano and Narayanan Shivakumar},
\r
281 title = {dSCAM: Finding Document Copies across Multiple Databases},
\r
282 booktitle = {Proceedings of the 4th International Conference on Parallel and Distributed Information Systems},
\r
286 @INPROCEEDINGS{LinuxDesktop,
\r
287 author = {Jan Kasprzak},
\r
288 title = "{Desktop a jádro Linuxu}",
\r
289 booktitle = {Proceedings of the XXXI EurOpen.CZ Conference},
\r
290 isbn = "978-80-86583-13-6",
\r
292 publisher = "EurOpen.CZ, Plzeň",
\r
296 @INPROCEEDINGS{Filesystems,
\r
297 author = {Jan Kasprzak},
\r
298 title = "{Co umí souborové systémy}",
\r
299 booktitle = {Proceedings of the XXXII EurOpen.CZ Conference},
\r
300 isbn = "978-80-86583-14-3",
\r
301 pages = {105--118},
\r
302 publisher = "EurOpen.CZ, Plzeň",
\r
306 @INPROCEEDINGS{GitEuropen,
\r
307 author = {Jan Kasprzak},
\r
308 title = "{Git aneb správa verzí trochu jinak}",
\r
309 booktitle = {Proceedings of the XXXIV EurOpen.CZ Conference},
\r
310 isbn = "978-80-86583-16-7",
\r
311 pages = {107--118},
\r
312 publisher = "EurOpen.CZ, Plzeň",
\r
316 @INPROCEEDINGS{Clusters,
\r
317 author = {Jan Kasprzak},
\r
318 title = "{Clusterová řešení pod Linuxem}",
\r
319 booktitle = {SLT 2001: Proceedings of the 2nd Seminar on Linux and \TeX},
\r
320 isbn = "80-7302-009-2",
\r
321 pages = {161--168},
\r
322 publisher = "Konvoj, Brno",
\r
327 AUTHOR = {{Webis at Bauhaus-Universität Weimar} and
\r
328 {NLEL at Universidad Politécnica de Valencia}},
\r
329 HOWPUBLISHED = {\url{http://www.webis.de/research/corpora}},
\r
330 TITLE = {{PAN Plagiarism Corpus 2009 (PAN-PC-09)}},
\r
332 NOTE = {{Martin Potthast, Andreas Eiselt, Benno Stein,
\r
333 Alberto Barrón-Cedeño, and Paolo Rosso (editors)}}
\r
336 @INPROCEEDINGS{ngram,
\r
337 author = {William B. Cavnar and John M. Trenkle},
\r
338 title = {N-Gram-Based Text Categorization},
\r
339 booktitle = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document Analysis and Information Retrieval},
\r
344 @inproceedings{intrinsic,
\r
345 author = {Sven Meyer zu Eissen and Benno Stein},
\r
346 booktitle = {ECIR},
\r
347 editor = {Mounia Lalmas and Andy MacFarlane and Stefan M. Rüger and Anastasios Tombros and Theodora Tsikrika and Alexei Yavlinsky},
\r
349 publisher = {Springer},
\r
350 series = {Lecture Notes in Computer Science},
\r
351 title = {Intrinsic Plagiarism Detection},
\r
352 url = {http://dblp.uni-trier.de/db/conf/ecir/ecir2006.html#EissenS06},
\r
355 ee = {http://dx.doi.org/10.1007/11735106_66},
\r
356 isbn = {3-540-33347-9},
\r
357 date = {2006-04-03}
\r
360 @INPROCEEDINGS{Heintze96scalabledocument,
\r
361 author = {Nevin Heintze},
\r
362 title = {Scalable Document Fingerprinting},
\r
363 booktitle = {Proc. USENIX Workshop on Electronic Commerce},
\r
367 @inproceedings{suffixtree,
\r
368 author = {Manber, Udi and Myers, Gene},
\r
369 title = {Suffix arrays: a new method for on-line string searches},
\r
370 booktitle = {SODA '90: Proceedings of the first annual ACM-SIAM symposium on Discrete algorithms},
\r
372 isbn = {0-89871-251-3},
\r
373 pages = {319--327},
\r
374 location = {San Francisco, California, United States},
\r
375 publisher = {Society for Industrial and Applied Mathematics},
\r
376 address = {Philadelphia, PA, USA}
\r
382 AUTHOR = "Andrew Tridgell and Paul Mackerras",
\r
383 TITLE = "The rsync algorithm",
\r
384 DATE = "2004--05--19",
\r
386 INSTITUTION = "Department of Computer Science, FEIT, Australian National University",
\r
388 NOTE = "\url{http://hdl.handle.net/1885/40765}",
\r
393 AUTHOR = "Linus Torvalds and others",
\r
394 TITLE = "{Git---the Fast Version Control System}",
\r
395 HOWPUBLISHED = "\url{http://git-scm.com/}, retrieved 2011--01--12",
\r
400 TITLE = "{The RPM Package Manager}",
\r
401 HOWPUBLISHED = "\url{http://www.rpm.org/}, retrieved 2011--01--12",
\r
406 TITLE = "{DeltaRPM}",
\r
407 HOWPUBLISHED = "\url{ftp://ftp.suse.com/pub/projects/deltarpm/}, retrieved 2011--01--12",
\r
410 @mastersthesis{zazrivec,
\r
411 AUTHOR = "Milan Zázrivec",
\r
412 TITLE = "Algoritmus a implementace software pro tvorbu binárních záplat",
\r
413 SCHOOL = "Faculty of Informatics, Masaryk University",
\r
414 NOTE = "\url{http://is.muni.cz/th/60716/fi_m/}",
\r
419 KEY = "Google Scholar",
\r
420 TITLE = "{Google Scholar}",
\r
421 HOWPUBLISHED = "\url{http://scholar.google.com/}, retrieved 2011--01--12",
\r
426 TITLE = "{American National Standard for Information Systems -- Coded Character Sets -- 7-Bit American National Standard Code for Information Interchange (7-Bit ASCII), ANSI X3.4-1986}",
\r
427 DATE="1986--03--26",
\r
429 INSTITUTION = "American National Standards Institute, Inc.",
\r
433 AUTHOR = "The Unicode Consortium",
\r
434 TITLE = "{The Unicode Standard -- Version 4.0}",
\r
435 PUBLISHER = "Addison-Wesley, Boston, MA",
\r
437 ISBN = "0-321-18578-1",
\r
438 NOTE = "\url{http://www.unicode.org/unicode/standard/versions/enumeratedversions.html#Unicode_4_0_0}",
\r
443 TITLE = "{RFC 3629: UTF-8, a transformation format of ISO 10646}",
\r
444 AUTHOR = "F. Yergeau",
\r
446 NOTE = "\url{http://tools.ietf.org/html/rfc3629}",
\r
449 @MISC{thesisproposal,
\r
450 TITLE = "{Systems for Discovering Similar Documents}",
\r
451 AUTHOR = "Jan Kasprzak",
\r
452 INSTITUTION = "Faculty of Informatics, Masaryk University",
\r
454 NOTE = "Ph.D. thesis proposal, \url{http://is.muni.cz/th/1885/fi_r/}",
\r
457 @inproceedings{Kasprzak2010,
\r
458 title={Improving the reliability of the plagiarism detection system},
\r
459 author={Kasprzak, J. and Brandejs, M.},
\r
460 booktitle={Notebook Papers of CLEF 2010 LABs and Workshops},
\r
462 organization={Citeseer}
\r
465 @article{stamatatos2011plagiarism,
\r
466 title={Plagiarism detection using stopword n-grams},
\r
467 author={Stamatatos, E.},
\r
468 journal={Journal of the American Society for Information Science and Technology},
\r
470 publisher={Wiley Online Library}
\r
472 @inproceedings{pan09stamatatos,
\r
473 author = {Efstathios Stamatatos},
\r
475 title = {Intrinsic Plagiarism Detection Using Character n-gram Profiles},
\r
476 booktitle = {Proceedings of the SEPLN'09 Workshop on Uncovering Plagiarism, Authorship and Social Software Misuse},
\r
478 location = {San Sebastian (Donostia), Spain},
\r
479 issn = {1613-0073},
\r
483 @book{zipf1935psycho,
\r
484 title={The Psycho-Biology of Language},
\r
485 author={Zipf, George Kingsley},
\r
487 publisher={Houghton, Mifflin}
\r
490 @INPROCEEDINGS{potthastframework,
\r
491 TITLE = {{An Evaluation Framework for Plagiarism Detection}},
\r
493 AUTHOR = {Martin Potthast and Benno Stein and Alberto Barr{\'o}n-Cede{\~n}o and Paolo Rosso},
\r
494 BOOKTITLE = {Proceedings of the 23rd International Conference on Computational Linguistics (COLING 2010)},
\r
497 ADDRESS = {Beijing, China},
\r
498 PUBLISHER = {Association for Computational Linguistics},
\r