diff --git a/README.md b/README.md
index 3b76135..6e21717 100644
--- a/README.md
+++ b/README.md
@@ -5,4 +5,15 @@ Wraps libraries for reading foreign file formats:
 
 * Evan Miller's Stata, SPSS, ... reader
 * Avik Sengupta's Excel reader
-* As yet non-existent JSON reader
+* JSON input/output
+
+# Usage Example
+
+    using DataFrames
+    using DataFramesIO
+
+    s = """[{"id":1, "val":5.5}, {"id":2, "val": 6.6}]"""
+
+    df = json2df(s)
+    json = df2json(df)
+    df2 = json2df(json)
diff --git a/src/DataFramesIO.jl b/src/DataFramesIO.jl
new file mode 100644
index 0000000..e46742f
--- /dev/null
+++ b/src/DataFramesIO.jl
@@ -0,0 +1,12 @@
+module DataFramesIO
+    using DataArrays
+    using DataFrames
+    using JSON
+
+    export json2df, df2json
+    # export xls2df, df2xls
+    # export stata2df, df2stata
+    # export spss2df, df2spss
+
+    include("json.jl")
+end
diff --git a/src/json.jl b/src/json.jl
new file mode 100644
index 0000000..6778837
--- /dev/null
+++ b/src/json.jl
@@ -0,0 +1,81 @@
+# function tighttypes!(adf::AbstractDataFrame)
+#     nrows, ncols = size(adf)
+#     for j in 1:ncols
+#         T = None
+#         col = adf[j]
+#         for i in 1:nrows
+#             if !isna(col[i])
+#                 T = typejoin(T, typeof(col[i]))
+#             end
+#         end
+#         adf[j] = convert(DataVector{T}, col)
+#     end
+#     return
+# end
+
+# Parse a JSON string holding an array of objects into a DataFrame.
+#
+# Arguments:
+#   s - JSON text whose top-level value is an array of objects, e.g.
+#       """[{"id":1, "val":5.5}, {"id":2, "val": 6.6}]""".
+#
+# Returns a DataFrame with one row per object and one column per key of
+# the first object, in sorted key order. Every column is created with
+# element type Any. An empty array yields an empty DataFrame().
+#
+# A key that is absent from a later object, or whose parsed value is
+# `nothing` (JSON null), is stored as NA. Keys that appear only in later
+# objects are ignored.
+function json2df(s::String) # -> DataFrame
+    # TODO: Optimize memory access
+    # TODO: Implement and call tighttypes!(df)
+    records = JSON.parse(s)
+    nrows = length(records)
+    if nrows == 0
+        return DataFrame()
+    end
+    # The first record defines the columns; sorting fixes their order.
+    colnames = convert(Vector{UTF8String}, collect(keys(records[1])))
+    sort!(colnames)
+    # Check that keys are valid column names
+    ncols = length(colnames)
+    df = DataFrame(repeat([Any], inner = [ncols]), colnames, nrows)
+    for i in 1:nrows
+        record = records[i]
+        for j in 1:ncols
+            # A missing key is treated like an explicit null; both end up
+            # as NA instead of raising a KeyError or storing `nothing`.
+            value = get(record, colnames[j], nothing)
+            df[i, j] = value === nothing ? NA : value
+        end
+    end
+    # tighttypes!(df)
+    clean_colnames!(df)
+    return df
+end
+
+# Serialize a data frame to a JSON string holding an array of objects.
+#
+# Arguments:
+#   adf - the data frame to serialize.
+#
+# Returns the text produced by JSON.json for a vector holding one Dict
+# per row, keyed by column name. NA entries are replaced by `nothing`
+# before serialization (expected to be written as JSON null -- confirm
+# against the JSON.jl version in use), so that json2df can read the
+# output back.
+function df2json(adf::AbstractDataFrame) # -> UTF8String
+    nrows, ncols = size(adf)
+    cnames = colnames(adf)
+    records = Array(Dict{UTF8String, Any}, nrows)
+    for i in 1:nrows
+        record = Dict{UTF8String, Any}()
+        for j in 1:ncols
+            value = adf[i, j]
+            # Check the type directly so that every non-NA value,
+            # including array-valued cells, passes through unchanged.
+            record[cnames[j]] = isa(value, NAtype) ? nothing : value
+        end
+        records[i] = record
+    end
+    return JSON.json(records)
+end
diff --git a/test/json.jl b/test/json.jl
new file mode 100644
index 0000000..ab2bc26
--- /dev/null
+++ b/test/json.jl
@@ -0,0 +1,51 @@
+module TestJSON
+    using Base.Test
+    using DataArrays
+    using DataFrames
+    using DataFramesIO
+
+    s = """[
+        {
+            "id":1,
+            "company":"Telstra",
+            "symbol":"ASX:TLS",
+            "price":5.27
+        },
+        {
+            "id":2,
+            "company":"BHP",
+            "symbol":"ASX:BHP",
+            "price":37.77
+        },
+        {
+            "id":3,
+            "company":"Commonwealth Bank of Australia",
+            "symbol":"ASX:CBA",
+            "price":77.58
+        }
+    ]"""
+
+    df = json2df(s)
+    @test isequal(size(df, 1), 3)
+    @test isequal(size(df, 2), 4)
+    @test isequal(colnames(df), ["company", "id", "price", "symbol"])
+    @test isequal(df[3, "id"], 3)
+    @test isequal(df[3, "price"], 77.58)
+    json = df2json(df)
+    df2 = json2df(json)
+    @test df == df2
+
+    # JSON null and keys missing from later records both become NA,
+    # and NA survives a round trip through df2json.
+    sparse = """[{"id":1, "val":5.5}, {"id":2, "val":null}, {"id":3}]"""
+    dfna = json2df(sparse)
+    @test isequal(size(dfna, 1), 3)
+    @test isequal(size(dfna, 2), 2)
+    @test isequal(dfna[1, "val"], 5.5)
+    @test isna(dfna[2, "val"])
+    @test isna(dfna[3, "val"])
+    dfna2 = json2df(df2json(dfna))
+    @test isequal(dfna2[1, "val"], 5.5)
+    @test isna(dfna2[2, "val"])
+    @test isna(dfna2[3, "val"])
+end
diff --git a/test/runtests.jl b/test/runtests.jl
new file mode 100644
index 0000000..1408973
--- /dev/null
+++ b/test/runtests.jl
@@ -0,0 +1,16 @@
+#
+# Correctness Tests
+#
+
+using Base.Test
+using DataFrames
+using DataFramesIO
+
+my_tests = ["json.jl"]
+
+@printf "Running tests:\n"
+
+for my_test in my_tests
+    @printf " * %s\n" my_test
+    include(my_test)
+end