#!/usr/bin/env python
# Generate a JSON mapping of seguid => [protein accessions] from FASTA
# records read on stdin; the JSON document is written to stdout.

import collections
import hashlib
import json
import re
import sys

from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid

# Captures the text that follows "ref|" up to the next "|" in a
# pipe-delimited FASTA id, e.g. "gi|1|ref|NP_000005.2|" -> "NP_000005.2".
ref_re = re.compile(r'ref\|([^|]+)')


def extract_ref_ac(record_id):
    """Return the accession that follows ``ref|`` in *record_id*.

    Args:
        record_id: FASTA record identifier string.

    Returns:
        The text captured after ``ref|`` (up to the next ``|``).

    Raises:
        ValueError: if *record_id* contains no ``ref|<accession>`` field.
    """
    match = ref_re.search(record_id)
    if match is None:
        # Fail with the offending id rather than an opaque AttributeError
        # on None.group().
        raise ValueError(
            "no 'ref|<accession>' field in FASTA record id: %r" % (record_id,))
    return match.group(1)


def build_seguid_map(records, checksum=None):
    """Group record accessions by the checksum of their sequence.

    Args:
        records: iterable of objects exposing ``id`` and ``seq`` attributes
            (such as the records yielded by ``SeqIO.parse``).
        checksum: callable taking the sequence as a string and returning
            the grouping key. Defaults to Biopython's ``seguid``.

    Returns:
        A dict mapping each checksum to a sorted list of the unique
        accessions whose sequences produced it.

    Raises:
        ValueError: if a record id has no ``ref|<accession>`` field.
    """
    if checksum is None:
        checksum = seguid

    acs_by_checksum = collections.defaultdict(set)
    for record in records:
        # str(seq) is the supported replacement for Seq.tostring(), which
        # newer Biopython releases no longer provide.
        key = checksum(str(record.seq))
        acs_by_checksum[key].add(extract_ref_ac(record.id))

    # Sets are not JSON-serializable; sorting (instead of a bare list())
    # makes the output reproducible from run to run.
    return {key: sorted(acs) for key, acs in acs_by_checksum.items()}


def main():
    """Read FASTA from stdin and print the seguid => accessions JSON."""
    seguid_acs = build_seguid_map(SeqIO.parse(sys.stdin, "fasta"))
    print(json.dumps(seguid_acs, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()

## <LICENSE>
## Copyright 2014 HGVS Contributors (https://bitbucket.org/invitae/hgvs)
## 
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
## 
##     http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
## </LICENSE>
