#!/usr/bin/env escript
%% -*- erlang -*-
%%
%% Try to extract
%% Title, Authors and Abstract from PDF file
%% then archive the PDF
%% Insert a Links: section and make a local copy of the file
%%
%% Assert following data structure
%%
%% Default paper directory: $HOME/Documents/papers/
%% paper index file: ?PAPERS_MD (README.md)
%% local paper copy: local/
%%
%% Check setting with env
%% PAPERS_DIR       - position of directory (may be git directory!)
%% PAPERS_COPY      - directory for the pdf copies (default: PAPERS_DIR/local)
%%
%% Require programs: pdfinfo and pdftotext
%% 
-mode(compile).

-define(PAPERS_MD, "README.md").

%% Script entry point.
%%
%% Args is the list of command line arguments. Every argument whose
%% extension is ".pdf" (in any letter case) is handed to pdf_archive/3,
%% the others are reported and skipped.
%%
%% The archive directory is read from PAPERS_DIR (default
%% $HOME/Documents/papers) and the directory for the pdf copies from
%% PAPERS_COPY (default <archive directory>/local). Both directories are
%% created when missing, including missing parent directories. A directory
%% that cannot be created is reported and processing continues.
main(Args) ->
    PapersDir = case os:getenv("PAPERS_DIR") of
                    false ->
                        filename:join([os:getenv("HOME"), "Documents", "papers"]);
                    Dir1 ->
                        Dir1
                end,
    PaperCopy = case os:getenv("PAPERS_COPY") of
                    false ->
                        filename:join(PapersDir, "local");
                    Dir2 ->
                        Dir2
                end,
    ensure_directory(PapersDir),
    ensure_directory(PaperCopy),
    lists:foreach(
      fun(File) ->
              %% compare the extension in lower case so that names ending
              %% in ".PDF" are accepted as well
              case string:lowercase(filename:extension(File)) of
                  ".pdf" ->
                      pdf_archive(File, PapersDir, PaperCopy);
                  _ ->
                      io:format("paper_archive ~s not pdf file, ignored\n",
                                [File])
              end
      end, Args).

%% Create Dir and any missing parent directory; report a failure.
ensure_directory(Dir) ->
    %% filelib:ensure_dir/1 creates the parents of the name it is given,
    %% so a placeholder file name below Dir makes it create Dir itself.
    case filelib:ensure_dir(filename:join(Dir, "placeholder")) of
        ok ->
            ok;
        {error, Reason} ->
            io:format("unable to create directory ~s : ~p\n",
                      [Dir, Reason])
    end.

%% 
%% Look for 
%%   "Title"
%%   "Keywords"
%%   "Author"
%%

%% Value sent in the User-Agent header of the http/https requests.
user_agent() ->
    "Mozilla/5.0 (X11; Linux i686; rv:81.0) "
    "Gecko/20100101 Firefox/81.0".

%% Archive one command line argument.
%%
%% Url is an http/https URL, a file: URI or a plain file name. A name that
%% is not a valid URI (for example one that contains a space) is treated
%% as a plain file name instead of crashing on the parse error.
%%
%% Returns the result of pdf_archive_file/5, or 1 after reporting that the
%% file could not be read or the download failed.
pdf_archive(Url, PapersDir, PaperCopy) ->
    case uri_string:parse(Url) of
        #{scheme := Scheme} when Scheme =:= "http"; Scheme =:= "https" ->
            pdf_archive_url(Url, PapersDir, PaperCopy);
        #{scheme := "file", path := FilePath} ->
            pdf_archive_path(FilePath, PapersDir, PaperCopy);
        _ ->
            %% no scheme, an unknown scheme or not a URI at all:
            %% use the argument itself as the file name
            pdf_archive_path(Url, PapersDir, PaperCopy)
    end.

%% Read a local pdf file and archive its content.
pdf_archive_path(FilePath, PapersDir, PaperCopy) ->
    case file:read_file(FilePath) of
        {ok, FileBin} ->
            pdf_archive_file(FilePath, FileBin, PapersDir, PaperCopy, []);
        {error, Reason} ->
            io:format("unable to read ~ts: ~p\n", [FilePath, Reason]),
            1
    end.

%% Download a pdf file and archive its content; the URL is kept as a link.
pdf_archive_url(Url, PapersDir, PaperCopy) ->
    _ = application:ensure_all_started(ssl),
    _ = application:ensure_all_started(inets),
    Request = {Url, [{"User-Agent", user_agent()}]},
    case httpc:request(get, Request, [], [{body_format, binary}]) of
        {ok, {{_Version, Status, _Phrase}, _Headers, Data}}
          when Status >= 200, Status < 300 ->
            Filename = filename:basename(Url),
            pdf_archive_file(Filename, Data, PapersDir, PaperCopy, [Url]);
        {ok, {{_Version, Status, Phrase}, _Headers, _Body}} ->
            %% the body of an error response is not the requested pdf
            io:format("unable to fetch ~ts: ~w ~ts\n", [Url, Status, Phrase]),
            1;
        {error, Reason} ->
            io:format("unable to fetch ~ts: ~p\n", [Url, Reason]),
            1
    end.

%% Archive the pdf content FileData that was read from FileName.
%%
%% The md5 sum of the content is looked up in the index file
%% (?PAPERS_MD inside PapersDir). When it is already present the paper
%% number is reported and nothing is written. Otherwise the content is
%% written to PaperCopy under the base name of FileName and a new entry
%% is appended to the index file; the copy is listed first in the Links
%% section, followed by Links0.
%%
%% Returns 0 when an entry was appended and 1 otherwise.
pdf_archive_file(FileName, FileData, PapersDir, PaperCopy, Links0) ->
    MD5 = md5sum(FileData),
    PapersFile = filename:join(PapersDir, ?PAPERS_MD),
    case next_paper_number(PapersFile, MD5) of
        {exist, N, MD5} ->
            io:format("Exists: paper number ~w md5=~s\n", [N, MD5]),
            1;
        {new, N, MD5} ->
            Copy = filename:join(PaperCopy, filename:basename(FileName)),
            case file:write_file(Copy, FileData) of
                {error, Reason} ->
                    io:format("unable to copy ~s to ~s: ~p\n",
                              [FileName, Copy, Reason]),
                    1;
                ok ->
                    io:format("copied ~w bytes from ~s to ~s\n",
                              [byte_size(FileData), FileName, Copy]),
                    append_paper_entry(PapersFile, N, MD5, Copy,
                                       [Copy | Links0])
            end
    end.

%% Collect the attributes of the copied pdf file and append entry number N
%% to the index file. Returns 0 on success and 1 when the index file could
%% not be opened.
append_paper_entry(PapersFile, N, MD5, Copy, Links) ->
    PdfInfo = pdf_info(Copy),
    io:format("pdf info: ~p\n", [PdfInfo]),
    FileInfo = scan_file(Copy),
    io:format("file info: ~p\n", [FileInfo]),
    Title = select_title(PdfInfo, FileInfo, file),
    Author = select_author(PdfInfo, FileInfo, file),
    Keywords = get_utf8("Keywords", PdfInfo, ""),
    Abstract = get_utf8("Abstract", FileInfo, ""),
    io:format("Attributes: ~p\n",
              [[{title,Title},{author,Author},{keywords,Keywords}]]),
    file:copy(PapersFile, PapersFile ++ ".bak"), %% best effort backup
    case file:open(PapersFile, [append]) of
        {ok, Fd} ->
            %% Every text field is converted with utf8/1 before it is
            %% written: ~s fails on a character list that holds code
            %% points above 255.
            io:format(Fd, "# ~w: ~s\n", [N, utf8(Title)]),
            if Keywords =:= <<"">> ->
                    ok;
               true ->
                    io:format(Fd, "## Keywords\n~s\n", [Keywords])
            end,
            io:format(Fd, "## Author\n~s\n", [utf8(Author)]),
            io:format(Fd, "## Abstract\n~s\n", [utf8(Abstract)]),
            io:format(Fd, "## MD5: ~s\n", [MD5]),
            io:format(Fd, "## Links\n", []),
            lists:foreach(
              fun(Link) ->
                      io:format(Fd, "* ~s\n", [utf8(format_md_link(Link))])
              end, Links),
            io:format(Fd, "\n", []),
            file:close(Fd),
            0;
        {error, Reason} ->
            io:format("unable to append to file ~s: ~p\n",
                      [PapersFile, Reason]),
            1
    end.

%% Format a link for the markdown index file.
%% Possible notations are
%%   <link>
%%   [link](link)
%%   <a href=link>link</a>
%% and the last one is emitted: the target is the result of
%% format_link/1, the visible text is URI as given.
format_md_link(URI) ->
    Target = format_link(URI),
    lists:append(["<a href=\"", Target, "\">", URI, "</a>"]).

%% Turn URI into a link target, adding the file:// prefix if needed.
%%
%% A URI with a scheme other than "file" is returned unchanged. For a
%% file: URI, or a name without a scheme, "file://" is put in front of the
%% path. A name that uri_string:parse/1 rejects (for example a file name
%% with a space) is a local file name as well, so it gets the prefix
%% instead of crashing on the error tuple.
format_link(URI) ->
    case uri_string:parse(URI) of
        #{scheme := "file", path := FilePath} ->
            "file://" ++ FilePath;
        #{scheme := _Other} ->
            URI;
        #{path := FilePath} ->
            "file://" ++ FilePath;
        {error, _Reason, _Term} ->
            "file://" ++ URI
    end.
    

%% Convert character data to a UTF-8 encoded binary.
utf8(CharList) ->
    unicode:characters_to_binary(CharList, unicode, unicode).

%% Look up Key in the key/value list List (Default when absent) and
%% return the value as a UTF-8 encoded binary.
get_utf8(Key, List, Default) ->
    Value = get_val(Key, List, Default),
    unicode:characters_to_binary(Value).

%% Return the value of the first {Key, Value} pair in List,
%% or Default when List has no entry for Key.
get_val(Key, List, Default) ->
    case lists:keyfind(Key, 1, List) of
        {_, Found} ->
            Found;
        false ->
            Default
    end.
    
%% Select the title of a paper.
%%
%% PdfInfo is the key/value list produced from pdfinfo, FileInfo the one
%% scanned from the text of the pdf file. A candidate is only used when
%% match_title/1 accepts it. When both are accepted Prefer decides:
%% file picks the scanned title, info the pdfinfo title, and any other
%% value picks the longer one (the scanned title on a tie).
%% Returns "" when neither candidate is accepted.
select_title(PdfInfo, FileInfo, Prefer) ->
    Title1 = get_val("Title", PdfInfo, ""),
    Title2 = get_val("Title", FileInfo, ""),
    case {match_title(Title1), match_title(Title2)} of
        {true, true} ->
            if Prefer =:= file -> Title2;
               Prefer =:= info -> Title1;
               %% the titles are character lists: byte_size/1 fails on a
               %% list, which made this guard always false
               length(Title1) > length(Title2) -> Title1;
               true -> Title2
            end;
        {true, false} -> Title1;
        {false, true} -> Title2;
        {false, false} -> ""
    end.

%% Filter auto generated stuff from the title field.
%% Text is accepted as a title when it is longer than 5 characters and
%% match_words/2 finds none of the listed words in it.
match_title("") ->
    false;
match_title(Text) ->
    Rejected = ["microsoft", ".doc", "arxiv", "proceedings"],
    length(Text) > 5 andalso match_words(Text, Rejected) =:= nomatch.

%% Select the author of a paper.
%%
%% PdfInfo is the key/value list produced from pdfinfo, FileInfo the one
%% scanned from the text of the pdf file. A candidate is only used when
%% match_author/1 accepts it. When both are accepted Prefer decides:
%% file picks the scanned author, info the pdfinfo author, and any other
%% value picks the longer one (the scanned author on a tie).
%% Returns "" when neither candidate is accepted.
select_author(PdfInfo, FileInfo, Prefer) ->
    Author1 = get_val("Author", PdfInfo, ""),
    Author2 = get_val("Author", FileInfo, ""),
    case {match_author(Author1), match_author(Author2)} of
        {true, true} ->
            if Prefer =:= file -> Author2;
               Prefer =:= info -> Author1;
               %% the authors are character lists: byte_size/1 fails on a
               %% list, which made this guard always false
               length(Author1) > length(Author2) -> Author1;
               true -> Author2
            end;
        {true, false} -> Author1;
        {false, true} -> Author2;
        {false, false} -> ""
    end.

%% Filter auto generated stuff from the author field.
%% Text is accepted as an author when it is longer than 5 characters and
%% match_words/2 finds none of the listed words in it.
match_author("") ->
    false;
match_author(Text) ->
    Rejected = ["arxiv", "dvi", "latex", "libre",
                "university", "department"],
    length(Text) > 5 andalso match_words(Text, Rejected) =:= nomatch.

%% Search Text, ignoring case, for the first occurrence of one of Words.
%%
%% Returns nomatch, or {match, [Word, Rest]} where Word is the text that
%% matched and Rest is what follows it on the line after skipping at most
%% one character (such as the ":" in "Abstract: ...").
%%
%% Each word is taken literally: it is wrapped in \Q...\E so that a "." in
%% ".doc" only matches a dot. The unicode option makes re treat the
%% subject as UTF-8, so Rest is a list of code points rather than of
%% UTF-8 bytes.
match_words(Text, Words) ->
    UText = unicode:characters_to_binary(Text),
    Quoted = ["\\Q" ++ Word ++ "\\E" || Word <- Words],
    Pattern = "(" ++ string:join(Quoted, "|") ++ ").?(.*)",
    re:run(UText, Pattern,
           [unicode, caseless, {capture, all_but_first, list}]).

%% Extract Title, Author and Abstract from the text of the pdf file File.
%% This is guesswork on the output of the pdftotext program.
%% Returns a key/value list with the entries that were found.
scan_file(File) ->
    %% The file name is placed inside double quotes on a shell command
    %% line. Escape the characters the shell still interprets there, so
    %% that a name cannot break out of the quotes.
    Escaped = lists:flatmap(
                fun(C) ->
                        case lists:member(C, "\\\"$`") of
                            true -> [$\\, C];
                            false -> [C]
                        end
                end, File),
    Data = os:cmd("pdftotext -layout -eol unix \"" ++ Escaped ++ "\" -"),
    {Acc1, Data1} = scan_title(Data, []),
    {Acc2, Data2} = scan_author(Data1, Acc1),
    {Acc3, _Data3} = scan_abstract(Data2, Acc2),
    Acc3.
    
%% Find the first run of lines accepted by match_title/1.
%% Returns {Acc1, Rest}: Acc1 is Acc with a {"Title", Title} entry in
%% front when a title was found, Rest is the text that follows.
scan_title(Data, Acc) ->
    {Title, Rest} = scan_match(Data, fun match_title/1),
    case Title of
        "" -> {Acc, Rest};
        _ -> {[{"Title", Title} | Acc], Rest}
    end.

%% Find the first run of lines accepted by match_author/1.
%% Returns {Acc1, Rest}: Acc1 is Acc with an {"Author", Author} entry in
%% front when an author was found, Rest is the text that follows.
scan_author(Data, Acc) ->
    {Author, Rest} = scan_match(Data, fun match_author/1),
    case Author of
        "" -> {Acc, Rest};
        _ -> {[{"Author", Author} | Acc], Rest}
    end.

%% Look in the next 100 lines for a line containing "abstract" or
%% "preface" and collect the paragraph found there under the key
%% "Abstract".
scan_abstract(Data, Acc) ->
    MaxLines = 100,
    StartWords = ["abstract", "preface"],
    scan_paragraph(Data, MaxLines, "Abstract", StartWords, Acc).

%% Skip lines until Match returns true for one, then collect that line
%% and the following lines for which Match returns true.
%% Returns {Para, Rest}; Para is "" when no line matched.
scan_match(Data, Match) ->
    case get_line(Data) of
        {Line, Rest} ->
            case Match(Line) of
                true ->
                    scan_match(Rest, Match, Line);
                false ->
                    scan_match(Rest, Match)
            end;
        eof ->
            {"", []}
    end.

%% Extend Para with the following lines as long as Match returns true.
%% The first line that is rejected is left in the returned text.
scan_match(Data, Match, Para) ->
    case get_line(Data) of
        {Line, Rest} ->
            case Match(Line) of
                true ->
                    scan_match(Rest, Match, merge_lines(Para, Line));
                false ->
                    {Para, Data}
            end;
        eof ->
            {Para, ""}
    end.

%% Scan at most N lines for one that contains one of StartWords.
%% When such a line is found, the paragraph starting there is collected
%% by paragraph_start/4 and stored under Key. When none is found within
%% N lines, or the text ends first, Acc is returned unchanged.
scan_paragraph(Data, 0, _Key, _StartWords, Acc) ->
    {Acc, Data};
scan_paragraph(Data, N, Key, StartWords, Acc) ->
    case get_line(Data) of
        {Line, Rest} ->
            case match_words(Line, StartWords) of
                {match, [_Word, Tail]} ->
                    paragraph_start(Rest, Key, Tail, Acc);
                nomatch ->
                    scan_paragraph(Rest, N - 1, Key, StartWords, Acc)
            end;
        eof ->
            {Acc, []}
    end.

%% Start collecting a paragraph after the line with the start word.
%% Para is the text that followed the start word on its own line.
%% While Para is empty, blank lines are skipped. The first other line is
%% merged into Para and paragraph/4 collects the rest.
%% When the text ends here, text already held in Para is kept under Key
%% (as paragraph/4 does) instead of being dropped.
paragraph_start(Data, Key, Para, Acc) ->
    case get_line(Data) of
        eof when Para =:= "" ->
            {Acc, []};
        eof ->
            {[{Key, Para} | Acc], []};
        {"", Data1} when Para =:= "" ->
            paragraph_start(Data1, Key, Para, Acc);
        {Line, Data1} ->
            Para1 = merge_lines(Para, Line),
            paragraph(Data1, Key, Para1, Acc)
    end.

%% Merge lines into Para until a blank line or the end of the text,
%% then put {Key, Para} in front of Acc.
%% Returns {Acc1, Rest} where Rest is the text after the blank line.
paragraph(Data, Key, Para, Acc) ->
    case get_line(Data) of
        {"", Rest} ->
            {[{Key, Para} | Acc], Rest};
        {Line, Rest} ->
            paragraph(Rest, Key, merge_lines(Para, Line), Acc);
        eof ->
            {[{Key, Para} | Acc], []}
    end.

%% Run the pdfinfo program on File and return its output as a key/value
%% list (see pdf_key_value/2).
pdf_info(File) ->
    %% The file name is placed inside double quotes on a shell command
    %% line. Escape the characters the shell still interprets there, so
    %% that a name cannot break out of the quotes.
    Escaped = lists:flatmap(
                fun(C) ->
                        case lists:member(C, "\\\"$`") of
                            true -> [$\\, C];
                            false -> [C]
                        end
                end, File),
    Data = os:cmd("pdfinfo \"" ++ Escaped ++ "\""),
    pdf_key_value(Data, []).

%% Split every line of Data into a {Key, Value} pair at the first ":"
%% and put it in front of Acc; the value is trimmed.
%% The pairs therefore end up in reverse line order.
pdf_key_value(Data, Acc) ->
    case get_line(Data) of
        {Line, Rest} ->
            {Key, RawValue} = pdf_key(Line),
            Entry = {Key, string:trim(RawValue)},
            pdf_key_value(Rest, [Entry | Acc]);
        eof ->
            Acc
    end.

%% Split Line at its first ":" into {Key, Value}.
%% Value is "" when Line contains no ":".
pdf_key(Line) ->
    pdf_key(Line, []).

%% Acc holds, in reverse order, key characters that were already consumed.
pdf_key(Line, Acc) ->
    {KeyTail, Rest} = lists:splitwith(fun(C) -> C =/= $: end, Line),
    Key = lists:reverse(Acc, KeyTail),
    case Rest of
        [$: | Value] -> {Key, Value};
        [] -> {Key, ""}
    end.

%%
%% Read the index file PaperFile and scan it for heading lines of the
%% form "# <integer>: title chars" and for "## MD5: " lines.
%%
%% Returns {exist, N, MD5} when MD5 is already listed (N is the number of
%% that paper) and {new, N, MD5} otherwise (N is the number to use for
%% the new paper). An index file that does not exist yet is treated as
%% empty, so the first paper gets number 1.
%%
next_paper_number(PaperFile, MD5) ->
    Bin = case file:read_file(PaperFile) of
              {ok, Contents} -> Contents;
              {error, enoent} -> <<>>
          end,
    Lines = binary:split(Bin, <<"\n">>, [global]),
    scan_next_number(Lines, 0, MD5, byte_size(MD5)).

%% Walk the lines of the index file.
%% Last is the number of the most recent "# <integer>:" heading seen.
%% A "## MD5: " line that starts with MD5 ends the scan with
%% {exist, Last, MD5}; reaching the end of the lines gives
%% {new, Last + 1, MD5}.
scan_next_number([], Last, MD5, _MD5Len) ->
    {new, Last + 1, MD5};
scan_next_number([Line | Rest], Last, MD5, MD5Len) ->
    case Line of
        <<"## MD5: ", MD5:MD5Len/binary, _/binary>> ->
            {exist, Last, MD5};
        <<"# ", Heading/binary>> ->
            Number = heading_number(Heading, Last),
            scan_next_number(Rest, Number, MD5, MD5Len);
        _ ->
            scan_next_number(Rest, Last, MD5, MD5Len)
    end.

%% Integer in front of the first ":" of Heading, or Default when that
%% part is not an integer.
heading_number(Heading, Default) ->
    [Prefix | _] = binary:split(Heading, <<":">>),
    try
        binary_to_integer(string:trim(Prefix))
    catch
        error:_ -> Default
    end.

%% MD5 sum of Bin as a binary of lower case hexadecimal digits.
md5sum(Bin) ->
    Digest = erlang:md5(Bin),
    HexDigits = bin_to_hex(Digest),
    list_to_binary(HexDigits).

%% Convert Bin to a string with one lower case hexadecimal digit
%% for every 4 bits.
bin_to_hex(Bin) ->
    [hex_digit(Nibble) || <<Nibble:4>> <= Bin].

%% Character for a value in the range 0..15.
hex_digit(Nibble) when Nibble < 10 ->
    $0 + Nibble;
hex_digit(Nibble) ->
    $a + (Nibble - 10).

%% Merge two lines of a paragraph.
%% If Part1 ends in a hyphen the word was split at the end of the line:
%% the hyphen is dropped and Part2 is appended directly. The ASCII hyphen
%% $-, the Unicode hyphen U+2010 and the soft hyphen U+00AD are
%% recognised. Otherwise the parts are joined with a single space.
%% An empty part adds nothing, so no stray space is produced.
merge_lines("", Part2) ->
    Part2;
merge_lines(Part1, "") ->
    Part1;
merge_lines(Part1, Part2) ->
    case lists:reverse(Part1) of
        [Hyphen | Part11] when Hyphen =:= $-;
                               Hyphen =:= 16#2010;
                               Hyphen =:= 16#00AD ->
            lists:reverse(Part11, Part2);
        _ ->
            Part1 ++ [$\s | Part2]
    end.

%% Get the next line from character data as returned from pdftotext.
%% Returns {Line, Rest} where Line is trimmed and Rest is the text after
%% the newline, or eof when only a blank last line (or nothing) is left.
get_line(Data) when is_list(Data) ->
    get_line_(Data, []).

%% Acc holds, in reverse order, characters of the line already consumed.
get_line_(Data, Acc) ->
    {Head, Tail} = lists:splitwith(fun(C) -> C =/= $\n end, Data),
    Line = string:trim(lists:reverse(Acc, Head)),
    case Tail of
        [$\n] when Line =:= [] -> eof;
        [$\n | Rest] -> {Line, Rest};
        [] when Line =:= [] -> eof;
        [] -> {Line, []}
    end.